From 7b64060c07c323d6ce89cddb1cf2c88858e04c92 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Wed, 15 Apr 2026 16:44:38 +0200 Subject: [PATCH 01/37] [SRC] Move the refill xbar into the group level for future scaling. --- hardware/src/cachepool_cluster.sv | 358 ++++-------------------------- hardware/src/cachepool_group.sv | 324 ++++++++++++++++++++++++++- 2 files changed, 353 insertions(+), 329 deletions(-) diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index df687a3..8402b83 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -218,12 +218,13 @@ module cachepool_cluster // Wire Definitions // ---------------- // 1. AXI - axi_mst_cache_req_t [NumTiles-1:0][TileNarrowAxiPorts-1:0] axi_tile_req; - axi_mst_cache_resp_t [NumTiles-1:0][TileNarrowAxiPorts-1:0] axi_tile_rsp; - axi_slv_cache_req_t [ClusterWideOutAxiPorts-1 :0] wide_axi_slv_req; - axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1 :0] wide_axi_slv_rsp; - axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; - axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; + // BootROM wide AXI from group (one per tile, BootROM only) + axi_mst_cache_req_t [NumTiles-1:0] axi_tile_req; + axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_rsp; + axi_slv_cache_req_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_req; + axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_rsp; + axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; + axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; // 2. 
BootROM reg_cache_req_t [NumTiles-1:0] bootrom_reg_req; @@ -245,125 +246,9 @@ module cachepool_cluster // CachePool Tile // --------------- - cache_trans_req_t [NumL1CacheCtrl-1 :0] cache_refill_req; - cache_trans_rsp_t [NumL1CacheCtrl-1 :0] cache_refill_rsp; - - cache_trans_req_t [NumTiles-1 :0] cache_core_req; - cache_trans_rsp_t [NumTiles-1 :0] cache_core_rsp; - - cache_trans_req_chan_t [NumTiles*NumClusterMst-1 :0] tile_req_chan; - cache_trans_rsp_chan_t [NumTiles*NumClusterMst-1 :0] tile_rsp_chan; - logic [NumTiles*NumClusterMst-1 :0] tile_req_valid, tile_req_ready, tile_rsp_valid, tile_rsp_ready; - - l2_req_t [ClusterWideOutAxiPorts-1 :0] l2_req; - l2_rsp_t [ClusterWideOutAxiPorts-1 :0] l2_rsp; - - cache_trans_req_chan_t [ClusterWideOutAxiPorts-1 :0] l2_req_chan; - cache_trans_rsp_chan_t [ClusterWideOutAxiPorts-1 :0] l2_rsp_chan; - logic [ClusterWideOutAxiPorts-1 :0] l2_req_valid, l2_req_ready , l2_rsp_valid, l2_rsp_ready; - - typedef logic [$clog2(NumClusterMst*NumTiles)-1:0] l2_sel_t; - // one more bit for out-of-range alert - typedef logic [$clog2(ClusterWideOutAxiPorts) :0] tile_sel_err_t; - typedef logic [$clog2(ClusterWideOutAxiPorts)-1:0] tile_sel_t; - - // Which l2 we want to select for each req - tile_sel_err_t [NumTiles*NumClusterMst-1 :0] tile_sel_err; - tile_sel_t [NumTiles*NumClusterMst-1 :0] tile_sel; - // Which tile we selected for each req - l2_sel_t [ClusterWideOutAxiPorts-1 :0] tile_selected; - // which tile we want to select for each rsp - l2_sel_t [ClusterWideOutAxiPorts-1 :0] l2_sel; - // What is the priority for response wiring? 
- // Here we want to make sure the responses from one burst - // continues until done - // If the rsp is a burst with blen != 0, then we will keep - // the rr same, until got a burst rsp with blen == 0 - tile_sel_t [NumTiles*NumClusterMst-1 :0] l2_rsp_rr; - - logic [NumTiles*NumClusterMst-1 :0] rr_lock_d, rr_lock_q; - tile_sel_t [NumTiles*NumClusterMst-1 :0] l2_prio_d, l2_prio_q; - - - l2_sel_t [ClusterWideOutAxiPorts-1:0] port_id; - - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - assign port_id[i] = l2_rsp[i].p.user.tile_id * NumClusterMst + l2_rsp[i].p.user.bank_id; - end - - - if (Burst_Enable) begin : gen_burst_ext_sel - `FF(rr_lock_q, rr_lock_d, 1'b0) - `FF(l2_prio_q, l2_prio_d, 1'b0) - - for (genvar port = 0; port < NumTiles*NumClusterMst; port ++) begin : gen_rsp_rr - tile_sel_t l2_rr; - logic [ClusterWideOutAxiPorts-1:0] arb_valid; - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - // Used to check the round-robin selection - assign arb_valid[i] = (port_id[i] == port) & l2_rsp_valid[i]; - end - - always_comb begin - l2_prio_d[port] = l2_prio_q[port]; - rr_lock_d[port] = rr_lock_q[port]; - - // Determine the priority we give - // round-robin or locked to previous value? - if (|arb_valid) begin - if (rr_lock_q[port]) begin - // rr is locked because of burst - l2_prio_d[port] = l2_prio_q[port]; - end else begin - l2_prio_d[port] = l2_rr; - end - end - // assigned to xbar rr_i - l2_rsp_rr[port] = l2_prio_d[port]; - - // Lock judgement - // Should it work on the l2_rsp instead of tile_rsp? 
- if (tile_rsp_chan[port].user.burst.is_burst & |arb_valid) begin - // We got a burst response - if (tile_rsp_chan[port].user.burst.burst_len == 0) begin - // this is the last transaction within a burt, remove lock - rr_lock_d[port] = 1'b0; - end else begin - // the burst response is not finished yet, lock the rr - rr_lock_d[port] = 1'b1; - end - end - end - - // We use the rr_arb_tree to get the round-robin selection - // No data is needed here, only need the handshaking - rr_arb_tree #( - .NumIn ( ClusterWideOutAxiPorts ), - .DataType ( logic ), - .ExtPrio ( 1'b0 ), - .AxiVldRdy ( 1'b1 ), - .LockIn ( 1'b1 ) - ) i_rr_arb_tree ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .flush_i ( '0 ), - .rr_i ( '0 ), - .req_i ( arb_valid ), - .gnt_o ( /*not used*/ ), - .data_i ( '0 ), - .req_o ( /*not used*/ ), - .gnt_i ( tile_rsp_ready[port] ), - .data_o ( /*not used*/ ), - .idx_o ( l2_rr ) - ); - end - end else begin - assign l2_prio_d = '0; - assign l2_prio_q = '0; - assign rr_lock_d = '0; - assign rr_lock_q = '0; - assign l2_rsp_rr = '0; - end + // l2 reqrsp ports from the group (one per L2 channel) + l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req; + l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp; if (NumTiles > 1) begin : gen_group cachepool_group #( @@ -419,11 +304,12 @@ module cachepool_cluster .private_start_addr_i ( private_start_addr ), .axi_narrow_req_o ( axi_out_req ), .axi_narrow_rsp_i ( axi_out_resp ), - .axi_wide_req_o ( axi_tile_req ), - .axi_wide_rsp_i ( axi_tile_rsp ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req ), - .cache_refill_rsp_i ( cache_refill_rsp ), + // BootROM wide AXI (one per tile, BootROM only) + .axi_wide_req_o ( axi_tile_req ), + .axi_wide_rsp_i ( axi_tile_rsp ), + // DRAM refill reqrsp (post-xbar, one per L2 channel) + .l2_req_o ( l2_req ), + .l2_rsp_i ( l2_rsp ), // Peripherals .icache_events_o ( icache_events ), .icache_prefetch_enable_i ( icache_prefetch_enable ), @@ -435,32 +321,18 @@ module cachepool_cluster 
.l1d_insn_ready_o ( l1d_insn_ready ), .l1d_busy_i ( l1d_busy ) ); - // TODO: 2 axi ports converted lost correct assignments - // 1. tile id? - // 2. mux then convert? - for (genvar t = 0; t < NumTiles; t ++) begin : gen_axi_converter - axi_to_reqrsp #( - .axi_req_t ( axi_mst_cache_req_t ), - .axi_rsp_t ( axi_mst_cache_resp_t ), - .AddrWidth ( AxiAddrWidth ), - .DataWidth ( AxiDataWidth ), - .UserWidth ( $bits(refill_user_t) ), - .IdWidth ( AxiIdWidthIn ), - .BufDepth ( NumSpatzOutstandingLoads ), - .reqrsp_req_t ( cache_trans_req_t ), - .reqrsp_rsp_t ( cache_trans_rsp_t ) - ) i_axi2reqrsp ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .busy_o ( ), - .axi_req_i ( axi_tile_req [t][TileMem] ), - .axi_rsp_o ( axi_tile_rsp [t][TileMem] ), - .reqrsp_req_o ( cache_core_req[t] ), - .reqrsp_rsp_i ( cache_core_rsp[t] ) - ); - end end else begin : gen_tile + // Signals used by gen_tile (single-tile path, not currently active) + cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req; + cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp; + cache_trans_req_t [NumTiles-1:0] cache_core_req; + cache_trans_rsp_t [NumTiles-1:0] cache_core_rsp; + + // TODO: gen_tile TileMem path — needs its own axi_tile_mem signals once fully migrated + axi_mst_cache_req_t gen_tile_mem_req; + axi_mst_cache_resp_t gen_tile_mem_rsp; + cachepool_tile #( .AxiAddrWidth ( AxiAddrWidth ), .AxiDataWidth ( AxiDataWidth ), @@ -527,8 +399,8 @@ module cachepool_cluster // Cache Refill Ports .cache_refill_req_o ( cache_refill_req ), .cache_refill_rsp_i ( cache_refill_rsp ), - .axi_wide_req_o ( axi_tile_req[0] ), - .axi_wide_rsp_i ( axi_tile_rsp[0] ), + .axi_wide_req_o ( {gen_tile_mem_req, axi_tile_req[0]} ), + .axi_wide_rsp_i ( {gen_tile_mem_rsp, axi_tile_rsp[0]} ), // Peripherals .icache_events_o ( icache_events ), .icache_prefetch_enable_i ( icache_prefetch_enable ), @@ -555,160 +427,16 @@ module cachepool_cluster .clk_i ( clk_i ), .rst_ni ( rst_ni ), .busy_o ( ), - .axi_req_i ( axi_tile_req [0][TileMem] ), 
- .axi_rsp_o ( axi_tile_rsp [0][TileMem] ), + .axi_req_i ( gen_tile_mem_req ), + .axi_rsp_o ( gen_tile_mem_rsp ), .reqrsp_req_o ( cache_core_req[0] ), .reqrsp_rsp_i ( cache_core_rsp[0] ) ); end - // Additional one port for iCache connection - localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; - always_comb begin - for (int t = 0; t < NumTiles; t++) begin - for (int p = 0; p < ReqrspPortsTile; p++) begin - automatic int unsigned xbar_idx = t*ReqrspPortsTile + p; - automatic int unsigned refill_idx = t*NumL1CtrlTile + p-1; - - if (p == 0) begin - // connect_icache_path - tile_req_chan [xbar_idx] = cache_core_req [t].q; - // Scrmable address - tile_req_chan [xbar_idx].addr = scrambleAddr(cache_core_req[t].q.addr); - tile_req_valid [xbar_idx] = cache_core_req [t].q_valid; - cache_core_rsp [t].q_ready = tile_req_ready [xbar_idx]; - - cache_core_rsp [t].p = tile_rsp_chan [xbar_idx]; - cache_core_rsp [t].p_valid = tile_rsp_valid [xbar_idx]; - tile_rsp_ready [xbar_idx] = cache_core_req [t].p_ready; - // Tile ID assignment - tile_req_chan [xbar_idx].user.tile_id = t; - end else begin - // connect_refill_path - tile_req_chan [xbar_idx] = cache_refill_req[refill_idx].q; - // Scramble address - tile_req_chan [xbar_idx].addr = scrambleAddr(cache_refill_req[refill_idx].q.addr); - tile_req_valid [xbar_idx] = cache_refill_req[refill_idx].q_valid; - cache_refill_rsp[refill_idx].q_ready = tile_req_ready [xbar_idx]; - - cache_refill_rsp[refill_idx].p = tile_rsp_chan [xbar_idx]; - cache_refill_rsp[refill_idx].p_valid = tile_rsp_valid [xbar_idx]; - tile_rsp_ready [xbar_idx] = cache_refill_req[refill_idx].p_ready; - // Tile ID assignment - tile_req_chan [xbar_idx].user.tile_id = t; - end - end - end - end - - typedef struct packed { - int unsigned idx; - logic [AxiAddrWidth-1:0] base; - logic [AxiAddrWidth-1:0] mask; - } reqrsp_rule_t; - - reqrsp_rule_t [ClusterWideOutAxiPorts-1:0] xbar_rule; - - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - assign xbar_rule[i] 
= '{ - idx : i, - base : DramAddr + DramPerChSize * i, - mask : ({AxiAddrWidth{1'b1}} << $clog2(DramPerChSize)) - }; - end - - logic [$clog2(ClusterWideOutAxiPorts):0] default_idx; - assign default_idx = ClusterWideOutAxiPorts; - - for (genvar inp = 0; inp < NumClusterMst*NumTiles; inp ++) begin : gen_xbar_sel - addr_decode_napot #( - .NoIndices (ClusterWideOutAxiPorts+1 ), - .NoRules (ClusterWideOutAxiPorts ), - .addr_t (axi_addr_t ), - .rule_t (reqrsp_rule_t ) - ) i_snitch_decode_napot ( - .addr_i (tile_req_chan[inp].addr), - .addr_map_i (xbar_rule ), - .idx_o (tile_sel_err[inp] ), - .dec_valid_o (/* Unused */ ), - .dec_error_o (/* Unused */ ), - .en_default_idx_i (1'b1 ), - .default_idx_i (default_idx ) - ); - - assign tile_sel[inp] = tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)-1:0]; - -`ifndef TARGET_SYNTHESIS - // Alert the system that we have illegal memory access - IllegalMemAccess : assert property( - @(posedge clk_i) disable iff (!rst_ni) (tile_req_valid[inp] |-> !tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)])) - else $error("Visited illegal address: time=%0t, port=%0d, addr=0x%08h", $time, inp, tile_req_chan[inp].addr); - // else $fatal (1, "Visited address is not mapped"); -`endif - end - - reqrsp_xbar #( - .NumInp (NumClusterMst*NumTiles ), - .NumOut (ClusterWideOutAxiPorts ), - .PipeReg (1'b1 ), - .ExtReqPrio (1'b0 ), - .ExtRspPrio (Burst_Enable ), - .tcdm_req_chan_t (cache_trans_req_chan_t ), - .tcdm_rsp_chan_t (cache_trans_rsp_chan_t ) - ) i_cluster_xbar ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (tile_req_chan ), - .slv_req_valid_i (tile_req_valid ), - .slv_req_ready_o (tile_req_ready ), - .slv_rsp_o (tile_rsp_chan ), - .slv_rsp_valid_o (tile_rsp_valid ), - .slv_rsp_ready_i (tile_rsp_ready ), - .slv_sel_i (tile_sel[NumTiles*NumClusterMst-1:0] ), - .slv_rr_i ('0 ), - .slv_selected_o (tile_selected ), - .mst_req_o (l2_req_chan ), - .mst_req_valid_o (l2_req_valid ), - .mst_req_ready_i (l2_req_ready ), - .mst_rsp_i (l2_rsp_chan ), - 
.mst_rr_i (l2_rsp_rr ), - .mst_rsp_valid_i (l2_rsp_valid ), - .mst_rsp_ready_o (l2_rsp_ready ), - .mst_sel_i (l2_sel ) - ); - - for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin - // To L2 Channels - always_comb begin - l2_req[ch].q = '{ - addr : l2_req_chan[ch].addr, - write: l2_req_chan[ch].write, - amo : l2_req_chan[ch].amo, - data : l2_req_chan[ch].data, - strb : l2_req_chan[ch].strb, - size : l2_req_chan[ch].size, - default: '0 - }; - l2_req[ch].q.user = l2_req_chan[ch].user; - l2_req[ch].q_valid = l2_req_valid[ch] ; - l2_req_ready[ch] = l2_rsp[ch].q_ready; - - l2_rsp_chan [ch] = '{ - data : l2_rsp[ch].p.data, - error: l2_rsp[ch].p.error, - write: l2_rsp[ch].p.write, - default: '0 - }; - l2_rsp_chan [ch].user = l2_rsp[ch].p.user; - l2_rsp_valid[ch] = l2_rsp[ch].p_valid; - l2_req[ch].p_ready = l2_rsp_ready[ch]; - // calculate the port from the tile id and bank id - // bank_id == 0 --- bypass - // bank_id == 1-4 --- cache bank 0-3 - l2_sel[ch] = l2_rsp[ch].p.user.tile_id * NumClusterMst + l2_rsp[ch].p.user.bank_id; - end - end - + // ------------- + // To Main Memory: reqrsp_to_axi + output cut, consuming group l2 reqrsp ports + // ------------- for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch ++) begin : gen_output_axi reqrsp_to_axi #( .MaxTrans (NumSpatzOutstandingLoads*2 ), @@ -723,7 +451,7 @@ module cachepool_cluster .reqrsp_rsp_t (l2_rsp_t ), .axi_req_t (axi_slv_cache_req_t ), .axi_rsp_t (axi_slv_cache_resp_t ) - ) i_reqrsp2axi ( + ) i_reqrsp2axi ( .clk_i (clk_i ), .rst_ni (rst_ni ), .user_i (l2_req[ch].q.user ), @@ -734,10 +462,6 @@ module cachepool_cluster ); end - - // ------------- - // To Main Memory - // ------------- // Optionally decouple the external wide AXI master port. 
for (genvar port = 0; port < ClusterWideOutAxiPorts; port ++) begin : gen_axi_out_cut axi_cut #( @@ -825,13 +549,13 @@ module cachepool_cluster .reg_req_t (reg_cache_req_t ), .reg_rsp_t (reg_cache_rsp_t ) ) i_axi_to_reg_bootrom ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (1'b0 ), - .axi_req_i (axi_tile_req[t][TileBootROM] ), - .axi_rsp_o (axi_tile_rsp[t][TileBootROM] ), - .reg_req_o (bootrom_reg_req[t] ), - .reg_rsp_i (bootrom_reg_rsp[t] ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .axi_req_i (axi_tile_req[t] ), + .axi_rsp_o (axi_tile_rsp[t] ), + .reg_req_o (bootrom_reg_req[t] ), + .reg_rsp_i (bootrom_reg_rsp[t] ) ); bootrom i_bootrom ( diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index b14d1ac..32b3c46 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -139,13 +139,13 @@ module cachepool_group /// AXI Narrow out-port (UART/Peripheral) output axi_narrow_req_t [GroupNarrowAxiPorts-1:0] axi_narrow_req_o, input axi_narrow_resp_t [GroupNarrowAxiPorts-1:0] axi_narrow_rsp_i, - /// Wide AXI ports to cluster level - output axi_out_req_t [GroupWideAxiPorts-1:0] axi_wide_req_o, - input axi_out_resp_t [GroupWideAxiPorts-1:0] axi_wide_rsp_i, + /// Wide AXI ports to cluster level (BootROM only, one per tile) + output axi_out_req_t [NumTiles-1:0] axi_wide_req_o, + input axi_out_resp_t [NumTiles-1:0] axi_wide_rsp_i, - /// Cache refill ports - output cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req_o, - input cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp_i, + /// DRAM refill reqrsp ports (post-xbar, one per L2 channel) + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, /// Peripheral signals output icache_events_t [NrCores-1:0] icache_events_o, @@ -217,6 +217,306 @@ module cachepool_group logic [NumTiles-1:0] error; assign error_o = |error; + // Internal tile-side wide AXI: split into two flat arrays 
by port function + // BootROM (TileBootROM=0): goes to cluster output + axi_mst_cache_req_t [NumTiles-1:0] axi_tile_bootrom_req; + axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_bootrom_rsp; + // TileMem (TileMem=1): stays in group, fed into axi_to_reqrsp + axi_mst_cache_req_t [NumTiles-1:0] axi_tile_mem_req; + axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_mem_rsp; + + // BootROM ports routed directly to cluster output (one per tile) + for (genvar t = 0; t < NumTiles; t++) begin : gen_bootrom_passthrough + assign axi_wide_req_o[t] = axi_tile_bootrom_req[t]; + assign axi_tile_bootrom_rsp[t] = axi_wide_rsp_i[t]; + end + + // Cache refill ports from tiles (NumL1CacheCtrl = NumCores total) + cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req; + cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp; + + // cache_core_req/rsp: icache-bypass path, one per tile (from axi_to_reqrsp) + cache_trans_req_t [NumTiles-1:0] cache_core_req; + cache_trans_rsp_t [NumTiles-1:0] cache_core_rsp; + + // Flat xbar input channels: NumTiles * NumClusterMst ports + cache_trans_req_chan_t [NumTiles*NumClusterMst-1:0] tile_req_chan; + cache_trans_rsp_chan_t [NumTiles*NumClusterMst-1:0] tile_rsp_chan; + logic [NumTiles*NumClusterMst-1:0] tile_req_valid, tile_req_ready, + tile_rsp_valid, tile_rsp_ready; + + // Xbar output channels: one per L2 channel + cache_trans_req_chan_t [ClusterWideOutAxiPorts-1:0] l2_req_chan; + cache_trans_rsp_chan_t [ClusterWideOutAxiPorts-1:0] l2_rsp_chan; + logic [ClusterWideOutAxiPorts-1:0] l2_req_valid, l2_req_ready, + l2_rsp_valid, l2_rsp_ready; + + // Selection types + typedef logic [$clog2(NumClusterMst*NumTiles)-1:0] l2_sel_t; + typedef logic [$clog2(ClusterWideOutAxiPorts) :0] tile_sel_err_t; // one extra bit for OOB + typedef logic [$clog2(ClusterWideOutAxiPorts)-1:0] tile_sel_t; + + tile_sel_err_t [NumTiles*NumClusterMst-1:0] tile_sel_err; + tile_sel_t [NumTiles*NumClusterMst-1:0] tile_sel; + l2_sel_t [ClusterWideOutAxiPorts-1:0] tile_selected; + l2_sel_t 
[ClusterWideOutAxiPorts-1:0] l2_sel; + tile_sel_t [NumTiles*NumClusterMst-1:0] l2_rsp_rr; + + logic [NumTiles*NumClusterMst-1:0] rr_lock_d, rr_lock_q; + tile_sel_t [NumTiles*NumClusterMst-1:0] l2_prio_d, l2_prio_q; + + // port_id: which xbar input port does each L2 channel response target + l2_sel_t [ClusterWideOutAxiPorts-1:0] port_id; + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign port_id[i] = l2_rsp_i[i].p.user.tile_id * NumClusterMst + + l2_rsp_i[i].p.user.bank_id; + end + + // --------------------- + // axi_to_reqrsp: TileMem (icache-bypass) path, one per tile + // --------------------- + for (genvar t = 0; t < NumTiles; t++) begin : gen_axi_converter + axi_to_reqrsp #( + .axi_req_t ( axi_mst_cache_req_t ), + .axi_rsp_t ( axi_mst_cache_resp_t ), + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .UserWidth ( $bits(refill_user_t) ), + .IdWidth ( AxiIdWidthIn ), + .BufDepth ( NumSpatzOutstandingLoads ), + .reqrsp_req_t ( cache_trans_req_t ), + .reqrsp_rsp_t ( cache_trans_rsp_t ) + ) i_axi2reqrsp ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .busy_o ( ), + .axi_req_i ( axi_tile_mem_req[t] ), + .axi_rsp_o ( axi_tile_mem_rsp[t] ), + .reqrsp_req_o ( cache_core_req[t] ), + .reqrsp_rsp_i ( cache_core_rsp[t] ) + ); + end + + // --------------------- + // Wiring: assemble flat xbar input from icache-bypass and refill paths + // --------------------- + // Port layout per tile: p=0 -> icache-bypass (cache_core_req), + // p=1..NumL1CtrlTile -> refill (cache_refill_req) + localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; + always_comb begin + for (int t = 0; t < NumTiles; t++) begin + for (int p = 0; p < ReqrspPortsTile; p++) begin + automatic int unsigned xbar_idx = t * ReqrspPortsTile + p; + automatic int unsigned refill_idx = t * NumL1CtrlTile + p - 1; + + if (p == 0) begin + // icache-bypass path + tile_req_chan [xbar_idx] = cache_core_req[t].q; + tile_req_chan [xbar_idx].addr = scrambleAddr(cache_core_req[t].q.addr); + 
tile_req_valid [xbar_idx] = cache_core_req[t].q_valid; + cache_core_rsp [t].q_ready = tile_req_ready[xbar_idx]; + + cache_core_rsp [t].p = tile_rsp_chan [xbar_idx]; + cache_core_rsp [t].p_valid = tile_rsp_valid[xbar_idx]; + tile_rsp_ready [xbar_idx] = cache_core_req[t].p_ready; + tile_req_chan [xbar_idx].user.tile_id = t; + end else begin + // refill path + tile_req_chan [xbar_idx] = cache_refill_req[refill_idx].q; + tile_req_chan [xbar_idx].addr = scrambleAddr(cache_refill_req[refill_idx].q.addr); + tile_req_valid [xbar_idx] = cache_refill_req[refill_idx].q_valid; + cache_refill_rsp[refill_idx].q_ready = tile_req_ready[xbar_idx]; + + cache_refill_rsp[refill_idx].p = tile_rsp_chan [xbar_idx]; + cache_refill_rsp[refill_idx].p_valid = tile_rsp_valid[xbar_idx]; + tile_rsp_ready [xbar_idx] = cache_refill_req[refill_idx].p_ready; + tile_req_chan [xbar_idx].user.tile_id = t; + end + end + end + end + + // --------------------- + // Address decoder: select L2 channel per xbar input port + // --------------------- + typedef struct packed { + int unsigned idx; + logic [AxiAddrWidth-1:0] base; + logic [AxiAddrWidth-1:0] mask; + } reqrsp_rule_t; + + reqrsp_rule_t [ClusterWideOutAxiPorts-1:0] xbar_rule; + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign xbar_rule[i] = '{ + idx : i, + base : DramAddr + DramPerChSize * i, + mask : ({AxiAddrWidth{1'b1}} << $clog2(DramPerChSize)) + }; + end + + logic [$clog2(ClusterWideOutAxiPorts):0] default_idx; + assign default_idx = ClusterWideOutAxiPorts; + + for (genvar inp = 0; inp < NumClusterMst*NumTiles; inp++) begin : gen_xbar_sel + addr_decode_napot #( + .NoIndices ( ClusterWideOutAxiPorts+1 ), + .NoRules ( ClusterWideOutAxiPorts ), + .addr_t ( axi_addr_t ), + .rule_t ( reqrsp_rule_t ) + ) i_snitch_decode_napot ( + .addr_i ( tile_req_chan[inp].addr ), + .addr_map_i ( xbar_rule ), + .idx_o ( tile_sel_err[inp] ), + .dec_valid_o ( /* unused */ ), + .dec_error_o ( /* unused */ ), + .en_default_idx_i ( 1'b1 ), + 
.default_idx_i ( default_idx ) + ); + assign tile_sel[inp] = tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)-1:0]; + +`ifndef TARGET_SYNTHESIS + IllegalMemAccess : assert property ( + @(posedge clk_i) disable iff (!rst_ni) + (tile_req_valid[inp] |-> !tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)])) + else $error("Visited illegal address: time=%0t, port=%0d, addr=0x%08h", + $time, inp, tile_req_chan[inp].addr); +`endif + end + + // --------------------- + // Burst protection logic + // --------------------- + if (Burst_Enable) begin : gen_burst_ext_sel + `FF(rr_lock_q, rr_lock_d, 1'b0) + `FF(l2_prio_q, l2_prio_d, 1'b0) + + for (genvar port = 0; port < NumTiles*NumClusterMst; port++) begin : gen_rsp_rr + tile_sel_t l2_rr; + logic [ClusterWideOutAxiPorts-1:0] arb_valid; + + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign arb_valid[i] = (port_id[i] == port) & l2_rsp_valid[i]; + end + + always_comb begin + l2_prio_d[port] = l2_prio_q[port]; + rr_lock_d[port] = rr_lock_q[port]; + + if (|arb_valid) begin + if (rr_lock_q[port]) begin + l2_prio_d[port] = l2_prio_q[port]; + end else begin + l2_prio_d[port] = l2_rr; + end + end + l2_rsp_rr[port] = l2_prio_d[port]; + + if (tile_rsp_chan[port].user.burst.is_burst & |arb_valid) begin + if (tile_rsp_chan[port].user.burst.burst_len == 0) begin + rr_lock_d[port] = 1'b0; + end else begin + rr_lock_d[port] = 1'b1; + end + end + end + + rr_arb_tree #( + .NumIn ( ClusterWideOutAxiPorts ), + .DataType ( logic ), + .ExtPrio ( 1'b0 ), + .AxiVldRdy ( 1'b1 ), + .LockIn ( 1'b1 ) + ) i_rr_arb_tree ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .flush_i ( '0 ), + .rr_i ( '0 ), + .req_i ( arb_valid ), + .gnt_o ( /* not used */ ), + .data_i ( '0 ), + .req_o ( /* not used */ ), + .gnt_i ( tile_rsp_ready[port]), + .data_o ( /* not used */ ), + .idx_o ( l2_rr ) + ); + end + end else begin + assign l2_prio_d = '0; + assign l2_prio_q = '0; + assign rr_lock_d = '0; + assign rr_lock_q = '0; + assign l2_rsp_rr = '0; + end + + // 
--------------------- + // Cluster (DRAM) xbar + // --------------------- + reqrsp_xbar #( + .NumInp ( NumClusterMst*NumTiles ), + .NumOut ( ClusterWideOutAxiPorts ), + .PipeReg ( 1'b1 ), + .ExtReqPrio ( 1'b0 ), + .ExtRspPrio ( Burst_Enable ), + .tcdm_req_chan_t ( cache_trans_req_chan_t ), + .tcdm_rsp_chan_t ( cache_trans_rsp_chan_t ) + ) i_cluster_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( tile_req_chan ), + .slv_req_valid_i ( tile_req_valid ), + .slv_req_ready_o ( tile_req_ready ), + .slv_rsp_o ( tile_rsp_chan ), + .slv_rsp_valid_o ( tile_rsp_valid ), + .slv_rsp_ready_i ( tile_rsp_ready ), + .slv_sel_i ( tile_sel[NumTiles*NumClusterMst-1:0] ), + .slv_rr_i ( '0 ), + .slv_selected_o ( tile_selected ), + .mst_req_o ( l2_req_chan ), + .mst_req_valid_o ( l2_req_valid ), + .mst_req_ready_i ( l2_req_ready ), + .mst_rsp_i ( l2_rsp_chan ), + .mst_rr_i ( l2_rsp_rr ), + .mst_rsp_valid_i ( l2_rsp_valid ), + .mst_rsp_ready_o ( l2_rsp_ready ), + .mst_sel_i ( l2_sel ) + ); + + // --------------------- + // l2_req/rsp packing: bridge xbar channels <-> l2_req_t/l2_rsp_t port + // --------------------- + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_pack + always_comb begin + // Request: xbar -> group output port + l2_req_o[ch].q = '{ + addr : l2_req_chan[ch].addr, + write : l2_req_chan[ch].write, + amo : l2_req_chan[ch].amo, + data : l2_req_chan[ch].data, + strb : l2_req_chan[ch].strb, + size : l2_req_chan[ch].size, + default: '0 + }; + l2_req_o[ch].q.user = l2_req_chan[ch].user; + l2_req_o[ch].q_valid = l2_req_valid[ch]; + l2_req_ready[ch] = l2_rsp_i[ch].q_ready; + + // Response: group input port -> xbar + l2_rsp_chan[ch] = '{ + data : l2_rsp_i[ch].p.data, + error : l2_rsp_i[ch].p.error, + write : l2_rsp_i[ch].p.write, + default: '0 + }; + l2_rsp_chan[ch].user = l2_rsp_i[ch].p.user; + l2_rsp_valid[ch] = l2_rsp_i[ch].p_valid; + l2_req_o[ch].p_ready = l2_rsp_ready[ch]; + + // Response demux: which xbar input port does this response 
target? + l2_sel[ch] = l2_rsp_i[ch].p.user.tile_id * NumClusterMst + + l2_rsp_i[ch].p.user.bank_id; + end + end + // Tile remote access signals // In/Out relative to the tile (out--leave a tile; in--enter a tile) // Tile-side flat layout: index = j + r*NrTCDMPortsPerCore (j=xbar idx, r=remote slot within xbar) @@ -355,12 +655,12 @@ module cachepool_group .remote_req_i ( tile_remote_in_req [t] ), .remote_rsp_o ( tile_remote_in_rsp [t] ), .remote_rsp_ready_o ( tile_remote_in_ready[t] ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req_o[t*NumL1CtrlTile+:NumL1CtrlTile] ), - .cache_refill_rsp_i ( cache_refill_rsp_i[t*NumL1CtrlTile+:NumL1CtrlTile] ), - // BootROM / Core-side Cache Bypass - .axi_wide_req_o ( axi_wide_req_o [t*TileWideAxiPorts+:TileWideAxiPorts] ), - .axi_wide_rsp_i ( axi_wide_rsp_i [t*TileWideAxiPorts+:TileWideAxiPorts] ), + // Cache Refill Ports (now internal, connected to group-level xbar) + .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), // Peripherals .icache_events_o ( /* unused */ ), .icache_prefetch_enable_i ( icache_prefetch_enable_i ), From c61cfbfcf71c1a5d728eeaebcddc8f7709ca0e28 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 16 Apr 2026 17:08:07 +0200 Subject: [PATCH 02/37] [SRC] Move BootROM inside group level for easier scaling. 
--- hardware/bootrom/bootrom.elf | Bin 5248 -> 5248 bytes hardware/src/cachepool_cluster.sv | 124 +++++---------------------- hardware/src/cachepool_group.sv | 124 +++++++++++++++++++++------ hardware/src/cachepool_pkg.sv | 17 +++- sim/scripts/vsim_cluster.tcl | 4 - sim/scripts/vsim_group.tcl | 4 + util/auto-benchmark/write_results.py | 2 +- 7 files changed, 139 insertions(+), 136 deletions(-) diff --git a/hardware/bootrom/bootrom.elf b/hardware/bootrom/bootrom.elf index 8c26b6e3aa39cd37fa01fff29408b31ad5525ed5..6b957508d8d33db4fdddc9983021e4b4651b5da4 100755 GIT binary patch delta 19 acmZqBY|z}mEyPx8QE8Cov{_Inp9ugt$OUTv delta 19 acmZqBY|z}mEyU(!RuW|5u~|?kp9ugrgat1E diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index 8402b83..e09e599 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -218,17 +218,10 @@ module cachepool_cluster // Wire Definitions // ---------------- // 1. AXI - // BootROM wide AXI from group (one per tile, BootROM only) - axi_mst_cache_req_t [NumTiles-1:0] axi_tile_req; - axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_rsp; - axi_slv_cache_req_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_req; - axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_rsp; - axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; - axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; - - // 2. BootROM - reg_cache_req_t [NumTiles-1:0] bootrom_reg_req; - reg_cache_rsp_t [NumTiles-1:0] bootrom_reg_rsp; + axi_slv_cache_req_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_req; + axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_rsp; + axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; + axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; // 3. 
Peripherals axi_addr_t private_start_addr; @@ -304,9 +297,6 @@ module cachepool_cluster .private_start_addr_i ( private_start_addr ), .axi_narrow_req_o ( axi_out_req ), .axi_narrow_rsp_i ( axi_out_resp ), - // BootROM wide AXI (one per tile, BootROM only) - .axi_wide_req_o ( axi_tile_req ), - .axi_wide_rsp_i ( axi_tile_rsp ), // DRAM refill reqrsp (post-xbar, one per L2 channel) .l2_req_o ( l2_req ), .l2_rsp_i ( l2_rsp ), @@ -323,16 +313,8 @@ module cachepool_cluster ); end else begin : gen_tile - // Signals used by gen_tile (single-tile path, not currently active) - cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req; - cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp; - cache_trans_req_t [NumTiles-1:0] cache_core_req; - cache_trans_rsp_t [NumTiles-1:0] cache_core_rsp; - - // TODO: gen_tile TileMem path — needs its own axi_tile_mem signals once fully migrated - axi_mst_cache_req_t gen_tile_mem_req; - axi_mst_cache_resp_t gen_tile_mem_rsp; - + // TODO: single-tile path not yet migrated to new refill/bootrom datapath. + // This branch is never elaborated in the current configuration (NumTiles > 1 always). 
cachepool_tile #( .AxiAddrWidth ( AxiAddrWidth ), .AxiDataWidth ( AxiDataWidth ), @@ -388,7 +370,6 @@ module cachepool_cluster .private_start_addr_i ( private_start_addr ), .axi_out_req_o ( axi_out_req [0] ), .axi_out_resp_i ( axi_out_resp [0] ), - // Remote Ports (not used) .remote_req_o ( ), .remote_req_dst_o ( ), .remote_rsp_i ( '0 ), @@ -396,13 +377,11 @@ module cachepool_cluster .remote_req_i ( '0 ), .remote_rsp_o ( ), .remote_rsp_ready_o ( ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req ), - .cache_refill_rsp_i ( cache_refill_rsp ), - .axi_wide_req_o ( {gen_tile_mem_req, axi_tile_req[0]} ), - .axi_wide_rsp_i ( {gen_tile_mem_rsp, axi_tile_rsp[0]} ), - // Peripherals - .icache_events_o ( icache_events ), + .cache_refill_req_o ( ), + .cache_refill_rsp_i ( '0 ), + .axi_wide_req_o ( ), + .axi_wide_rsp_i ( '0 ), + .icache_events_o ( ), .icache_prefetch_enable_i ( icache_prefetch_enable ), .cl_interrupt_i ( cl_interrupt ), .dynamic_offset_i ( dynamic_offset ), @@ -412,26 +391,6 @@ module cachepool_cluster .l1d_insn_ready_o ( l1d_insn_ready ), .l1d_busy_i ( l1d_busy ) ); - - axi_to_reqrsp #( - .axi_req_t ( axi_mst_cache_req_t ), - .axi_rsp_t ( axi_mst_cache_resp_t ), - .AddrWidth ( AxiAddrWidth ), - .DataWidth ( AxiDataWidth ), - .UserWidth ( $bits(refill_user_t) ), - .IdWidth ( AxiIdWidthIn ), - .BufDepth ( NumSpatzOutstandingLoads ), - .reqrsp_req_t ( cache_trans_req_t ), - .reqrsp_rsp_t ( cache_trans_rsp_t ) - ) i_axi2reqrsp ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .busy_o ( ), - .axi_req_i ( gen_tile_mem_req ), - .axi_rsp_o ( gen_tile_mem_rsp ), - .reqrsp_req_o ( cache_core_req[0] ), - .reqrsp_rsp_i ( cache_core_rsp[0] ) - ); end // ------------- @@ -534,43 +493,6 @@ module cachepool_cluster assign axi_out_resp[0][ClusterUart] = axi_narrow_resp_i; end - /***** BootROM ****/ - for (genvar t = 0; t < NumTiles; t++) begin : gen_bootrom - axi_to_reg #( - .ADDR_WIDTH (AxiAddrWidth ), - .DATA_WIDTH (AxiDataWidth ), - .AXI_MAX_WRITE_TXNS (1 ), - 
.AXI_MAX_READ_TXNS (1 ), - .DECOUPLE_W (0 ), - .ID_WIDTH (WideIdWidthIn ), - .USER_WIDTH (AxiUserWidth ), - .axi_req_t (axi_mst_cache_req_t ), - .axi_rsp_t (axi_mst_cache_resp_t), - .reg_req_t (reg_cache_req_t ), - .reg_rsp_t (reg_cache_rsp_t ) - ) i_axi_to_reg_bootrom ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (1'b0 ), - .axi_req_i (axi_tile_req[t] ), - .axi_rsp_o (axi_tile_rsp[t] ), - .reg_req_o (bootrom_reg_req[t] ), - .reg_rsp_i (bootrom_reg_rsp[t] ) - ); - - bootrom i_bootrom ( - .clk_i (clk_i ), - .req_i (bootrom_reg_req[t].valid ), - .addr_i (addr_t'(bootrom_reg_req[t].addr) ), - .rdata_o(bootrom_reg_rsp[t].rdata ) - ); - - `FF(bootrom_reg_rsp[t].ready, bootrom_reg_req[t].valid, 1'b0) - - assign bootrom_reg_rsp[t].error = 1'b0; - end - - /***** CSR/Peripherals *****/ `REG_BUS_TYPEDEF_ALL(reg, narrow_addr_t, narrow_data_t, narrow_strb_t) @@ -644,13 +566,13 @@ module cachepool_cluster .SpillR ( XbarLatency[0] ), .MaxWTrans ( 2 ) ) i_axi_csr_mux ( - .clk_i ( clk_i ), // Clock - .rst_ni ( rst_ni ), // Asynchronous reset active low - .test_i ('0 ), // Test Mode enable + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ('0 ), .slv_reqs_i ( {axi_in_req_i, axi_core_csr_req} ), .slv_resps_o ( {axi_in_resp_o, axi_core_csr_rsp} ), - .mst_req_o ( axi_csr_req ), - .mst_resp_i ( axi_csr_rsp ) + .mst_req_o ( axi_csr_req ), + .mst_resp_i ( axi_csr_rsp ) ); axi_to_reg #( @@ -666,13 +588,13 @@ module cachepool_cluster .reg_req_t (reg_req_t ), .reg_rsp_t (reg_rsp_t ) ) i_csr_axi_to_reg ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (1'b0 ), - .axi_req_i (axi_csr_req ), - .axi_rsp_o (axi_csr_rsp ), - .reg_req_o (reg_req ), - .reg_rsp_i (reg_rsp ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .axi_req_i (axi_csr_req ), + .axi_rsp_o (axi_csr_rsp ), + .reg_req_o (reg_req ), + .reg_rsp_i (reg_rsp ) ); diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index 32b3c46..30a4285 100644 --- 
a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -139,13 +139,10 @@ module cachepool_group /// AXI Narrow out-port (UART/Peripheral) output axi_narrow_req_t [GroupNarrowAxiPorts-1:0] axi_narrow_req_o, input axi_narrow_resp_t [GroupNarrowAxiPorts-1:0] axi_narrow_rsp_i, - /// Wide AXI ports to cluster level (BootROM only, one per tile) - output axi_out_req_t [NumTiles-1:0] axi_wide_req_o, - input axi_out_resp_t [NumTiles-1:0] axi_wide_rsp_i, /// DRAM refill reqrsp ports (post-xbar, one per L2 channel) - output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, - input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, /// Peripheral signals output icache_events_t [NrCores-1:0] icache_events_o, @@ -153,10 +150,10 @@ module cachepool_group input logic [NrCores-1:0] cl_interrupt_i, input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, input logic [3:0] l1d_private_i, - input cache_insn_t l1d_insn_i, + input cache_insn_t l1d_insn_i, input logic l1d_insn_valid_i, - output logic [NumTiles-1:0] l1d_insn_ready_o, - input logic [NumTiles-1:0] l1d_busy_i, + output logic [NumTiles-1:0] l1d_insn_ready_o, + input logic [NumTiles-1:0] l1d_busy_i, /// SRAM Configuration input impl_in_t [NrSramCfg-1:0] impl_i, @@ -218,19 +215,93 @@ module cachepool_group assign error_o = |error; // Internal tile-side wide AXI: split into two flat arrays by port function - // BootROM (TileBootROM=0): goes to cluster output + // BootROM (TileBootROM=0): muxed into single shared bootrom in this group axi_mst_cache_req_t [NumTiles-1:0] axi_tile_bootrom_req; axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_bootrom_rsp; // TileMem (TileMem=1): stays in group, fed into axi_to_reqrsp axi_mst_cache_req_t [NumTiles-1:0] axi_tile_mem_req; axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_mem_rsp; - // BootROM ports routed directly to cluster output (one per tile) - for (genvar t = 0; t 
< NumTiles; t++) begin : gen_bootrom_passthrough - assign axi_wide_req_o[t] = axi_tile_bootrom_req[t]; - assign axi_tile_bootrom_rsp[t] = axi_wide_rsp_i[t]; + // Mux all per-tile BootROM AXI ports into a single bootrom instance + axi_bootrom_slv_req_t axi_bootrom_mux_req; + axi_bootrom_slv_resp_t axi_bootrom_mux_rsp; + + if (NumTiles > 1) begin : gen_bootrom_mux + axi_mux #( + .SlvAxiIDWidth ( WideIdWidthIn ), + .slv_aw_chan_t ( axi_mst_cache_aw_chan_t ), + .mst_aw_chan_t ( axi_bootrom_slv_aw_chan_t ), + .w_chan_t ( axi_mst_cache_w_chan_t ), + .slv_b_chan_t ( axi_mst_cache_b_chan_t ), + .mst_b_chan_t ( axi_bootrom_slv_b_chan_t ), + .slv_ar_chan_t ( axi_mst_cache_ar_chan_t ), + .mst_ar_chan_t ( axi_bootrom_slv_ar_chan_t ), + .slv_r_chan_t ( axi_mst_cache_r_chan_t ), + .mst_r_chan_t ( axi_bootrom_slv_r_chan_t ), + .slv_req_t ( axi_mst_cache_req_t ), + .slv_resp_t ( axi_mst_cache_resp_t ), + .mst_req_t ( axi_bootrom_slv_req_t ), + .mst_resp_t ( axi_bootrom_slv_resp_t ), + .NoSlvPorts ( NumTiles ), + .FallThrough ( 0 ), + .SpillAw ( XbarLatency[4] ), + .SpillW ( XbarLatency[3] ), + .SpillB ( XbarLatency[2] ), + .SpillAr ( XbarLatency[1] ), + .SpillR ( XbarLatency[0] ), + .MaxWTrans ( 2 ) + ) i_axi_bootrom_mux ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), + .slv_reqs_i ( axi_tile_bootrom_req ), + .slv_resps_o( axi_tile_bootrom_rsp ), + .mst_req_o ( axi_bootrom_mux_req ), + .mst_resp_i ( axi_bootrom_mux_rsp ) + ); + end else begin : gen_bootrom_connect + // NumTiles==1: direct connect, no ID widening needed + assign axi_bootrom_mux_req = axi_bootrom_slv_req_t'(axi_tile_bootrom_req[0]); + assign axi_tile_bootrom_rsp[0] = axi_mst_cache_resp_t'(axi_bootrom_mux_rsp); end + // Single BootROM instance shared across all tiles in the group + `REG_BUS_TYPEDEF_ALL(reg_bootrom, addr_t, data_cache_t, strb_cache_t) + reg_bootrom_req_t bootrom_reg_req; + reg_bootrom_rsp_t bootrom_reg_rsp; + + axi_to_reg #( + .ADDR_WIDTH ( AxiAddrWidth ), + .DATA_WIDTH ( AxiDataWidth ), 
+ .AXI_MAX_WRITE_TXNS ( 1 ), + .AXI_MAX_READ_TXNS ( 1 ), + .DECOUPLE_W ( 0 ), + .ID_WIDTH ( BootRomAxiSlvIdWidth ), + .USER_WIDTH ( AxiUserWidth ), + .axi_req_t ( axi_bootrom_slv_req_t ), + .axi_rsp_t ( axi_bootrom_slv_resp_t ), + .reg_req_t ( reg_bootrom_req_t ), + .reg_rsp_t ( reg_bootrom_rsp_t ) + ) i_axi_to_reg_bootrom ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .testmode_i ( 1'b0 ), + .axi_req_i ( axi_bootrom_mux_req ), + .axi_rsp_o ( axi_bootrom_mux_rsp ), + .reg_req_o ( bootrom_reg_req ), + .reg_rsp_i ( bootrom_reg_rsp ) + ); + + bootrom i_bootrom ( + .clk_i ( clk_i ), + .req_i ( bootrom_reg_req.valid ), + .addr_i ( addr_t'(bootrom_reg_req.addr) ), + .rdata_o ( bootrom_reg_rsp.rdata ) + ); + + `FF(bootrom_reg_rsp.ready, bootrom_reg_req.valid, 1'b0) + assign bootrom_reg_rsp.error = 1'b0; + // Cache refill ports from tiles (NumL1CacheCtrl = NumCores total) cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req; cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp; @@ -449,7 +520,7 @@ module cachepool_group end // --------------------- - // Cluster (DRAM) xbar + // Refill (DRAM) xbar // --------------------- reqrsp_xbar #( .NumInp ( NumClusterMst*NumTiles ), @@ -459,7 +530,7 @@ module cachepool_group .ExtRspPrio ( Burst_Enable ), .tcdm_req_chan_t ( cache_trans_req_chan_t ), .tcdm_rsp_chan_t ( cache_trans_rsp_chan_t ) - ) i_cluster_xbar ( + ) i_refill_xbar ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .slv_req_i ( tile_req_chan ), @@ -634,8 +705,7 @@ module cachepool_group .clk_i ( clk_i ), .rst_ni ( rst_ni ), .impl_i ( impl_i ), - .error_o ( error[t] ), - // TODO: remove hardcode + .error_o ( error [t] ), .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), @@ -648,19 +718,19 @@ module cachepool_group .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), .axi_out_resp_i ( axi_narrow_rsp_i 
[t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), // Remote Access Ports - .remote_req_o ( tile_remote_out_req[t] ), - .remote_req_dst_o ( remote_out_sel_tile[t] ), - .remote_rsp_i ( tile_remote_out_rsp[t] ), + .remote_req_o ( tile_remote_out_req [t] ), + .remote_req_dst_o ( remote_out_sel_tile [t] ), + .remote_rsp_i ( tile_remote_out_rsp [t] ), .remote_rsp_ready_i ( tile_remote_out_ready[t] ), - .remote_req_i ( tile_remote_in_req [t] ), - .remote_rsp_o ( tile_remote_in_rsp [t] ), - .remote_rsp_ready_o ( tile_remote_in_ready[t] ), + .remote_req_i ( tile_remote_in_req [t] ), + .remote_rsp_o ( tile_remote_in_rsp [t] ), + .remote_rsp_ready_o ( tile_remote_in_ready [t] ), // Cache Refill Ports (now internal, connected to group-level xbar) - .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), - .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) - .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), - .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), // Peripherals .icache_events_o ( /* unused */ ), .icache_prefetch_enable_i ( icache_prefetch_enable_i ), diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index 737bc70..ef97cfe 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -192,6 +192,13 @@ package cachepool_pkg; // UART ID width, with an extra xbar localparam int unsigned SpatzAxiUartIdWidth = SpatzAxiNarrowIdWidth + $clog2(NumTiles); + // BootROM AXI ID width: wide data bus, muxed from NumTiles tile ports. 
+ // The group's axi_mst_cache slave ID width = GroupAxiIdWidth + 1 + // (cluster passes WideIdWidthIn = SpatzAxiIdOutWidth - clog2(NumClusterMst) + // = ClusterAxiIdWidth + 1 - ClusterRouteIdWidth = GroupAxiIdWidth + 1). + // The mux master adds $clog2(NumTiles) bits on top. + localparam int unsigned BootRomAxiSlvIdWidth = GroupAxiIdWidth + 1 + $clog2(NumTiles); + /***** Tile Ports *****/ // We have three sets of AXI ports for each tile: // 1) Wide output bus for BootRom & L2 (from ICache) @@ -286,6 +293,8 @@ package cachepool_pkg; typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + typedef logic [BootRomAxiSlvIdWidth-1:0] axi_bootrom_slv_id_t; + ////////////////// // TILE TYPES // ////////////////// @@ -424,9 +433,11 @@ package cachepool_pkg; `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(axi_csr_mst, axi_addr_t, axi_id_csr_mst_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(axi_csr_slv, axi_addr_t, axi_id_csr_slv_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(axi_csr_mst, axi_addr_t, axi_id_csr_mst_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(axi_csr_slv, axi_addr_t, axi_id_csr_slv_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // BootROM: wide data bus (same payload as cache), slv = post-mux (widened ID) + `AXI_TYPEDEF_ALL(axi_bootrom_slv, axi_addr_t, axi_bootrom_slv_id_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) /************************************************************** * FUNCTIONS diff --git a/sim/scripts/vsim_cluster.tcl b/sim/scripts/vsim_cluster.tcl index 
c8d66f3..e34f008 100644 --- a/sim/scripts/vsim_cluster.tcl +++ b/sim/scripts/vsim_cluster.tcl @@ -7,10 +7,6 @@ onerror {resume} set cluster_path $1 -add wave -noupdate -group Cluster -group xbar -group req_xbar ${cluster_path}/i_cluster_xbar/i_req_xbar/* -add wave -noupdate -group Cluster -group xbar -group rsp_xbar ${cluster_path}/i_cluster_xbar/i_rsp_xbar/* -add wave -noupdate -group Cluster -group xbar ${cluster_path}/i_cluster_xbar/* - add wave -noupdate -group Cluster -group CSR ${cluster_path}/i_cachepool_cluster_peripheral/* add wave -noupdate -group Cluster -group Internal ${cluster_path}/* diff --git a/sim/scripts/vsim_group.tcl b/sim/scripts/vsim_group.tcl index 8edb7e5..f247f7e 100644 --- a/sim/scripts/vsim_group.tcl +++ b/sim/scripts/vsim_group.tcl @@ -16,4 +16,8 @@ for {set p 0} {$p < $2} {incr p} { add wave -noupdate -group Group -group remote_xbar[$p] ${xbar_path}/* } +add wave -noupdate -group Group -group refill_xbar -group req_xbar ${group_path}/i_refill_xbar/i_req_xbar/* +add wave -noupdate -group Group -group refill_xbar -group rsp_xbar ${group_path}/i_refill_xbar/i_rsp_xbar/* + + add wave -noupdate -group Group -group Internal ${group_path}/* diff --git a/util/auto-benchmark/write_results.py b/util/auto-benchmark/write_results.py index 4d254fe..2035e38 100644 --- a/util/auto-benchmark/write_results.py +++ b/util/auto-benchmark/write_results.py @@ -18,7 +18,7 @@ def extract_uart_lines(input_file_path, output_file_path, config=None, kernel=No # Copy only lines containing '[UART]' for line in input_file: - if '[UART]' in line: + if '[UART]' in line or '[EOC]' in line: output_file.write(line) output_file.write("\n----------------------------------------\n") From e6a76635855a6738e6eb25fe9b03384d471074f3 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 4 May 2026 09:39:19 +0200 Subject: [PATCH 03/37] [Bender] Add FlooNoC as a dependency --- Bender.lock | 54 ++++++++++++++++++++++++++++++++++++++++++++--------- Bender.yml | 2 ++ Makefile | 24 
+++++++++++++++++++++++- 3 files changed, 70 insertions(+), 10 deletions(-) diff --git a/Bender.lock b/Bender.lock index 9684f42..c5b55df 100644 --- a/Bender.lock +++ b/Bender.lock @@ -16,17 +16,24 @@ packages: - common_verification - tech_cells_generic axi_riscv_atomics: - revision: 97dcb14ef057cbe5bd70dda2060b5bb9e7e04c6d - version: 0.7.0 + revision: 97a1dd2ac643c276880420a0cf8eea697f228aa9 + version: 0.8.3 source: Git: https://github.com/pulp-platform/axi_riscv_atomics.git dependencies: - axi - common_cells - common_verification + axi_stream: + revision: 54891ff40455ca94a37641b9da4604647878cc07 + version: 0.1.1 + source: + Git: https://github.com/pulp-platform/axi_stream.git + dependencies: + - common_cells common_cells: - revision: 9afda9abb565971649c2aa0985639c096f351171 - version: 1.38.0 + revision: 9ca8a7655f741e7dd5736669a20a301325194c28 + version: 1.39.0 source: Git: https://github.com/pulp-platform/common_cells.git dependencies: @@ -45,8 +52,27 @@ packages: Git: https://github.com/pulp-platform/dram_rtl_sim.git dependencies: - axi + floo_noc: + revision: 97306733f33acbb646c7e403c03a674fc1404b44 + version: null + source: + Git: https://github.com/pulp-platform/FlooNoC.git + dependencies: + - axi + - axi_riscv_atomics + - common_cells + - common_verification + - floo_noc_pd + - fpnew + - idma + floo_noc_pd: + revision: null + version: null + source: + Path: hardware/deps/floo_noc/./pd + dependencies: [] fpnew: - revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 + revision: e5aa6a01b5bbe1675c3aa8872e1203413ded83d1 version: null source: Git: https://github.com/pulp-platform/cvfpu.git @@ -61,14 +87,16 @@ packages: dependencies: - common_cells idma: - revision: b31e8f019c657eff4126bc789f0336d403da6766 - version: 0.4.2 + revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e + version: 0.6.5 source: Git: https://github.com/pulp-platform/iDMA.git dependencies: - axi + - axi_stream - common_cells - common_verification + - obi - register_interface insitu-cache: 
revision: fa761ddebc946f9b46509d84945bf41ee1a9ec49 @@ -79,6 +107,14 @@ packages: - axi - common_cells - register_interface + obi: + revision: 0155fc34e900c7c884e081c0a1114a247937ff69 + version: 0.1.7 + source: + Git: https://github.com/pulp-platform/obi.git + dependencies: + - common_cells + - common_verification register_interface: revision: 146501d80052b61475cdc333d3aab4cd769fd5dc version: 0.3.9 @@ -97,10 +133,10 @@ packages: - common_cells - tech_cells_generic spatz: - revision: ed25c78dd72d839db8141287f9516d78ee399b93 + revision: null version: null source: - Git: https://github.com/pulp-platform/spatz.git + Path: hardware/deps/spatz dependencies: - axi - axi_riscv_atomics diff --git a/Bender.yml b/Bender.yml index 45b01da..53cead9 100644 --- a/Bender.yml +++ b/Bender.yml @@ -17,6 +17,7 @@ dependencies: Insitu-Cache: { git: "https://github.com/pulp-platform/Insitu-Cache.git", rev: zexin/cachepool_dev } spatz: { git: "https://github.com/pulp-platform/spatz.git", rev: cachepool-32b } dram_rtl_sim: { git: "https://github.com/pulp-platform/dram_rtl_sim.git", rev: cachepool } + floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: main } workspace: checkout_dir: "./hardware/deps" @@ -45,6 +46,7 @@ sources: - hardware/src/cachepool_tile.sv # Level 3 - hardware/src/cachepool_group.sv + - hardware/src/cachepool_group_noc_wrapper.sv - hardware/src/cachepool_cluster.sv # Level 4 diff --git a/Makefile b/Makefile index 588cf0f..f407f05 100644 --- a/Makefile +++ b/Makefile @@ -163,6 +163,27 @@ $(BOOTROM_DIR)/bootrom.sv: $(BOOTROM_DIR)/bootrom.bin $(BOOTROM_DIR)/bootdata.cc ${PYTHON} $(SCRIPTS_DIR)/generate_bootrom.py \ $< -c $(HJSON_OUT) --output $@ +########### +# FlooNoC # +########### +FLOO_DIR ?= $(shell $(BENDER_INSTALL_DIR)/bender path floo_noc) +FLOO_GEN_OUTDIR ?= $(ROOT_DIR)/hardware/generated +FLOO_CFG ?= $(ROOT_DIR)/config/floonoc_cachepool_4g.yml +FLOO_SYS = $(subst .yml,,$(notdir $(FLOO_CFG))) +FLOO_NOC ?= $(addprefix $(FLOO_GEN_OUTDIR)/,$(subst 
.yml,_floo_noc.sv,$(notdir $(FLOO_CFG)))) + +$(info FLOO_DIR: $(FLOO_DIR)) + +# Generates the sources for FlooNoC +.PHONY: update-floonoc install-floogen clean-floonoc +install-floogen: + $(MAKE) -C $(FLOO_DIR) install-floogen + +update-floonoc: $(FLOO_NOC) +$(FLOO_NOC): install-floogen $(FLOO_CFG) + mkdir -p $(FLOO_GEN_OUTDIR) + floogen -c $(FLOO_CFG) -o $(FLOO_GEN_OUTDIR) --only-pkg + ########### # DramSys # ########### @@ -232,13 +253,13 @@ VLOG_FLAGS += -64 VLOG_DEFS = -DCACHEPOOL # Cluster configuration +VLOG_DEFS += -DNUM_GROUPS=$(num_groups) VLOG_DEFS += -DNUM_TILES=$(num_tiles) VLOG_DEFS += -DNUM_CORES=$(num_cores) VLOG_DEFS += -DDATA_WIDTH=$(data_width) VLOG_DEFS += -DADDR_WIDTH=$(addr_width) # Tile configuration -VLOG_DEFS += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) VLOG_DEFS += -DREFILL_DATA_WIDTH=$(refill_data_width) # L1 Data Cache @@ -259,6 +280,7 @@ VLOG_DEFS += -DSPATZ_NUM_IPU=$(spatz_num_ipu) VLOG_DEFS += -DSPATZ_MAX_TRANS=$(spatz_max_trans) VLOG_DEFS += -DSNITCH_MAX_TRANS=$(snitch_max_trans) VLOG_DEFS += -DREMOTE_PORT_PER_CORE=$(num_remote_ports_per_tile) +VLOG_DEFS += -DRG_PORT_PER_CORE=$(num_rg_ports_per_core) # AXI configuration VLOG_DEFS += -DAXI_USER_WIDTH=$(axi_user_width) From 684cd8d0bded3a03a90349fe4448d1ae01bbdfc4 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 4 May 2026 09:44:35 +0200 Subject: [PATCH 04/37] [SRC] Add multi-group support. Cross-group interconnection is WIP. 
--- config/cachepool_fpu_512.mk | 14 +- config/config.mk | 14 +- hardware/bootrom/bootdata.cc | 4 +- hardware/bootrom/bootdata_bootrom.cc | 4 +- hardware/bootrom/bootrom.bin | Bin 136 -> 136 bytes hardware/bootrom/bootrom.dump | 6 +- hardware/bootrom/bootrom.elf | Bin 5248 -> 5248 bytes hardware/bootrom/bootrom.sv | 4 +- hardware/src/cachepool_cluster.sv | 319 +++++++------- hardware/src/cachepool_group.sv | 447 +++++++++++++------- hardware/src/cachepool_group_noc_wrapper.sv | 248 +++++++++++ hardware/src/cachepool_pkg.sv | 53 ++- hardware/src/cachepool_tile.sv | 268 ++++++++++-- hardware/src/tcdm_cache_interco.sv | 277 ++++++++---- sim/scripts/vsim_core.tcl | 362 ++++++++-------- sim/scripts/vsim_wave.tcl | 60 ++- 16 files changed, 1422 insertions(+), 658 deletions(-) create mode 100644 hardware/src/cachepool_group_noc_wrapper.sv diff --git a/config/cachepool_fpu_512.mk b/config/cachepool_fpu_512.mk index 2e4c3ca..a9a5458 100644 --- a/config/cachepool_fpu_512.mk +++ b/config/cachepool_fpu_512.mk @@ -8,11 +8,14 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 4 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,16 +23,15 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 -num_remote_ports_per_tile ?= 2 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 diff --git a/config/config.mk b/config/config.mk index 9eee8cb..32ed0f3 100644 --- a/config/config.mk +++ b/config/config.mk @@ -26,13 +26,20 @@ include $(CACHEPOOL_DIR)/config/$(config).mk ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 1 + # Number of tiles -num_tiles ?= 1 +num_tiles_per_group ?= 4 +num_tiles = $(shell echo 
$$(( $(num_groups) * $(num_tiles_per_group)))) num_remote_ports_per_tile ?= 1 # Number of cores -num_cores ?= 4 +num_cores_per_tile ?= 4 +num_cores ?= $(shell echo $$(( $(num_tiles) * $(num_cores_per_tile)))) + +num_rg_ports_per_core ?= 0 # Core datawidth data_width ?= 32 @@ -45,9 +52,6 @@ addr_width ?= 32 ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 diff --git a/hardware/bootrom/bootdata.cc b/hardware/bootrom/bootdata.cc index f96b8ba..9703ee8 100644 --- a/hardware/bootrom/bootdata.cc +++ b/hardware/bootrom/bootdata.cc @@ -7,13 +7,13 @@ namespace sim { const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 16, + .core_count = 64, .hartid_base = 0, .tcdm_start = 0xbffff800, .tcdm_size = 0x800, .tcdm_offset = 0x0, .global_mem_start = 0x80000000, .global_mem_end = 0xa0000000, - .tile_count = 4}; + .tile_count = 16}; } // namespace sim diff --git a/hardware/bootrom/bootdata_bootrom.cc b/hardware/bootrom/bootdata_bootrom.cc index d578d55..2c18278 100644 --- a/hardware/bootrom/bootdata_bootrom.cc +++ b/hardware/bootrom/bootdata_bootrom.cc @@ -18,11 +18,11 @@ struct BootData { }; extern "C" const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 16, + .core_count = 64, .hartid_base = 0, .tcdm_start = 0xbffff800, .tcdm_size = 0x800, .tcdm_offset = 0x0, .global_mem_start = 0x80000000, .global_mem_end = 0xa0000000, - .tile_count = 4}; + .tile_count = 16}; diff --git a/hardware/bootrom/bootrom.bin b/hardware/bootrom/bootrom.bin index d4a9322a260b03f47a853755da9a0080adbec54e..01c26acb7246982415de0a8ab2aad76d2fb498d8 100755 GIT binary patch delta 16 XcmeBR>|mVW!ssy3Rhdy>Vx~L*CO8C3 delta 16 XcmeBR>|mVW!YDA&Rhf}xVx~L*Bv=Fw diff --git a/hardware/bootrom/bootrom.dump b/hardware/bootrom/bootrom.dump index dad90e3..3000779 100644 --- a/hardware/bootrom/bootrom.dump +++ b/hardware/bootrom/bootrom.dump @@ -1,5 +1,5 @@ 
-/scratch2/diyou/cachepool/ManyRVData/hardware/bootrom/bootrom.elf: file format elf32-littleriscv +/scratch2/diyou/cachepool/main/ManyRVData/hardware/bootrom/bootrom.elf: file format elf32-littleriscv Disassembly of section .text: @@ -29,7 +29,7 @@ Disassembly of section .rodata: 00001040 : 1040: 1000 .2byte 0x1000 1042: 0000 .2byte 0x0 - 1044: 0010 .2byte 0x10 + 1044: 0040 .2byte 0x40 1046: 0000 .2byte 0x0 1048: 0000 .2byte 0x0 104a: 0000 .2byte 0x0 @@ -44,7 +44,7 @@ Disassembly of section .rodata: 1062: a000 .2byte 0xa000 1064: 0000 .2byte 0x0 1066: 0000 .2byte 0x0 - 1068: 0004 .2byte 0x4 + 1068: 0010 .2byte 0x10 106a: 0000 .2byte 0x0 106c: 0000 .2byte 0x0 ... diff --git a/hardware/bootrom/bootrom.elf b/hardware/bootrom/bootrom.elf index 6b957508d8d33db4fdddc9983021e4b4651b5da4..dce1406ce0f47c856466c15eaaf183326b74fa7b 100755 GIT binary patch delta 32 ncmZqBY|z}`BEaY{*;PQ9QDAeXz#>LAkBVdqAK%S_LitPpm6Zs@ delta 32 ncmZqBY|z}`BETpx*;PQ9k!5qHz#>MrQj1E1G^fpiLitPpkfsQd diff --git a/hardware/bootrom/bootrom.sv b/hardware/bootrom/bootrom.sv index c3b8995..8bed1aa 100644 --- a/hardware/bootrom/bootrom.sv +++ b/hardware/bootrom/bootrom.sv @@ -21,9 +21,9 @@ module bootrom #( const logic [RomSize-1:0][DataWidth-1:0] mem = { 128'h00001040000010380000000000001038, - 128'h000000000000000400000000a0000000, + 128'h000000000000001000000000a0000000, 128'h00000000800000000000000000000800, - 128'hbffff800000000000000001000001000, + 128'hbffff800000000000000004000001000, 128'hffdff06f10500073000380670003a383, 128'h0203839301c383b30105ae0300c5a383, 128'h105000733047d07306c5a58300000597, diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index e09e599..ce2aef9 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -172,7 +172,11 @@ module cachepool_cluster localparam int unsigned NrNarrowMasters = 2; localparam int unsigned WideIdWidthOut = AxiIdWidthOut; - localparam int unsigned WideIdWidthIn = WideIdWidthOut - 
$clog2(NumClusterMst); + localparam int unsigned WideIdWidthIn = WideIdWidthOut - $clog2(NumClusterMst) - GroupMuxIdBits; + + // Pre-mux AXI ID width: per-group reqrsp_to_axi output. + // The multi-group axi_mux adds GroupMuxIdBits on top to reach WideIdWidthOut. + localparam int unsigned WideIdWidthPreMux = WideIdWidthOut - GroupMuxIdBits; // Cache XBar configuration struct localparam axi_pkg::xbar_cfg_t CacheXbarCfg = '{ @@ -201,8 +205,14 @@ module cachepool_cluster typedef logic [WideIdWidthOut-1:0] id_cache_slv_t; typedef logic [AxiUserWidth-1:0] user_cache_t; + // Pre-mux (per-group) AXI types: narrower ID, widened by axi_mux. + typedef logic [WideIdWidthPreMux-1:0] id_cache_premux_t; + `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) + // Post-mux AXI types (same as before — used for axi_cut and output). `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) + // Pre-mux AXI types (per-group reqrsp_to_axi output, input to axi_mux). + `AXI_TYPEDEF_ALL(axi_premux_cache, addr_t, id_cache_premux_t, data_cache_t, strb_cache_t, user_cache_t) `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t) @@ -218,8 +228,13 @@ module cachepool_cluster // Wire Definitions // ---------------- // 1. AXI + // Post-mux wide AXI (one per L2 channel, merged across groups). axi_slv_cache_req_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_req; axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_rsp; + // Per-group pre-mux wide AXI (per group, per L2 channel). + axi_premux_cache_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_premux_req; + axi_premux_cache_resp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_premux_rsp; + // Narrow AXI per tile (UART + Periph). 
axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; @@ -235,16 +250,21 @@ module cachepool_cluster logic [NumTiles-1:0] l1d_insn_ready; logic [NumTiles-1:0] l1d_busy; + // Per-group error signals. + logic [NumGroups-1:0] group_error; + // --------------- - // CachePool Tile + // CachePool Group // --------------- - // l2 reqrsp ports from the group (one per L2 channel) - l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req; - l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp; + // Per-group L2 reqrsp ports (one per L2 channel per group). + l2_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] l2_req; + l2_rsp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] l2_rsp; + + assign error_o = |group_error; - if (NumTiles > 1) begin : gen_group - cachepool_group #( + for (genvar g = 0; g < NumGroups; g++) begin : gen_group + cachepool_group_noc_wrapper #( .AxiAddrWidth ( AxiAddrWidth ), .AxiDataWidth ( AxiDataWidth ), .AxiIdWidthIn ( AxiIdWidthIn ), @@ -253,9 +273,9 @@ module cachepool_cluster .BootAddr ( BootAddr ), .UartAddr ( UartAddr ), .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NrCores ), + .NrCores ( NumCoreGroup ), .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks ), + .NrBanks ( NrBanks / NumGroups ), .ICacheLineWidth ( ICacheLineWidth ), .ICacheLineCount ( ICacheLineCount ), .ICacheSets ( ICacheSets ), @@ -272,7 +292,7 @@ module cachepool_cluster .axi_narrow_resp_t ( axi_narrow_resp_t ), .axi_out_req_t ( axi_mst_cache_req_t ), .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), + .Xdma ( Xdma[g*NumCoreGroup +: NumCoreGroup] ), .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), .DMAReqFifoDepth ( DMAReqFifoDepth ), .RegisterOffloadRsp ( RegisterOffloadRsp ), @@ -284,141 +304,118 @@ module cachepool_cluster .MaxMstTrans ( MaxMstTrans ), .MaxSlvTrans ( MaxSlvTrans ) ) i_group ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error_o ), - .debug_req_i ( debug_req_i ), - .meip_i ( meip_i ), - 
.mtip_i ( mtip_i ), - .msip_i ( msip_i ), - .hart_base_id_i ( hart_base_id_i ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .private_start_addr_i ( private_start_addr ), - .axi_narrow_req_o ( axi_out_req ), - .axi_narrow_rsp_i ( axi_out_resp ), + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( group_error[g] ), + .debug_req_i ( debug_req_i[g*NumCoreGroup +: NumCoreGroup] ), + .meip_i ( meip_i [g*NumCoreGroup +: NumCoreGroup] ), + .mtip_i ( mtip_i [g*NumCoreGroup +: NumCoreGroup] ), + .msip_i ( msip_i [g*NumCoreGroup +: NumCoreGroup] ), + .hart_base_id_i ( hart_base_id_i + 10'(g * NumCoreGroup) ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .private_start_addr_i ( private_start_addr ), + .axi_narrow_req_o ( axi_out_req [g*NumTilesPerGroup +: NumTilesPerGroup] ), + .axi_narrow_rsp_i ( axi_out_resp[g*NumTilesPerGroup +: NumTilesPerGroup] ), // DRAM refill reqrsp (post-xbar, one per L2 channel) - .l2_req_o ( l2_req ), - .l2_rsp_i ( l2_rsp ), + .l2_req_o ( l2_req[g] ), + .l2_rsp_i ( l2_rsp[g] ), // Peripherals - .icache_events_o ( icache_events ), - .icache_prefetch_enable_i ( icache_prefetch_enable ), - .cl_interrupt_i ( cl_interrupt ), - .dynamic_offset_i ( dynamic_offset ), - .l1d_private_i ( l1d_private ), - .l1d_insn_i ( l1d_insn ), - .l1d_insn_valid_i ( l1d_insn_valid ), - .l1d_insn_ready_o ( l1d_insn_ready ), - .l1d_busy_i ( l1d_busy ) - ); - - end else begin : gen_tile - // TODO: single-tile path not yet migrated to new refill/bootrom datapath. - // This branch is never elaborated in the current configuration (NumTiles > 1 always). 
- cachepool_tile #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NrCores ), - .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .TileIDWidth ( 1 ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_tile ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error_o ), - .debug_req_i ( debug_req_i ), - .meip_i ( meip_i ), - .mtip_i ( mtip_i ), - .msip_i ( msip_i ), - .hart_base_id_i ( hart_base_id_i ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .tile_id_i ( '0 ), - .private_start_addr_i ( private_start_addr ), - .axi_out_req_o ( axi_out_req [0] ), - .axi_out_resp_i ( axi_out_resp [0] ), - .remote_req_o ( ), - .remote_req_dst_o ( ), - .remote_rsp_i ( '0 ), - .remote_rsp_ready_i ( '0 ), - .remote_req_i ( '0 
), - .remote_rsp_o ( ), - .remote_rsp_ready_o ( ), - .cache_refill_req_o ( ), - .cache_refill_rsp_i ( '0 ), - .axi_wide_req_o ( ), - .axi_wide_rsp_i ( '0 ), - .icache_events_o ( ), - .icache_prefetch_enable_i ( icache_prefetch_enable ), - .cl_interrupt_i ( cl_interrupt ), - .dynamic_offset_i ( dynamic_offset ), - .l1d_private_i ( l1d_private ), - .l1d_insn_i ( l1d_insn ), - .l1d_insn_valid_i ( l1d_insn_valid ), - .l1d_insn_ready_o ( l1d_insn_ready ), - .l1d_busy_i ( l1d_busy ) + .icache_events_o ( icache_events[g*NumCoreGroup +: NumCoreGroup] ), + .icache_prefetch_enable_i ( icache_prefetch_enable ), + .cl_interrupt_i ( cl_interrupt [g*NumCoreGroup +: NumCoreGroup] ), + .dynamic_offset_i ( dynamic_offset ), + .l1d_private_i ( l1d_private ), + .l1d_insn_i ( l1d_insn ), + .l1d_insn_valid_i ( l1d_insn_valid ), + .l1d_insn_ready_o ( l1d_insn_ready[g*NumTilesPerGroup +: NumTilesPerGroup]), + .l1d_busy_i ( l1d_busy [g*NumTilesPerGroup +: NumTilesPerGroup]) ); end // ------------- - // To Main Memory: reqrsp_to_axi + output cut, consuming group l2 reqrsp ports + // To Main Memory: reqrsp_to_axi per group, then axi_mux across groups // ------------- - for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch ++) begin : gen_output_axi - reqrsp_to_axi #( - .MaxTrans (NumSpatzOutstandingLoads*2 ), - .ID ('0 ), - .EnBurst (1 ), - .ShuffleId (1 ), - .UserWidth ($bits(refill_user_t) ), - .ReqUserFallThrough (1'b0 ), - .DataWidth (AxiDataWidth ), - .AxiUserWidth (AxiUserWidth ), - .reqrsp_req_t (l2_req_t ), - .reqrsp_rsp_t (l2_rsp_t ), - .axi_req_t (axi_slv_cache_req_t ), - .axi_rsp_t (axi_slv_cache_resp_t ) - ) i_reqrsp2axi ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .user_i (l2_req[ch].q.user ), - .reqrsp_req_i (l2_req[ch] ), - .reqrsp_rsp_o (l2_rsp[ch] ), - .axi_req_o (wide_axi_slv_req[ch] ), - .axi_rsp_i (wide_axi_slv_rsp[ch] ) - ); + + // Step 1: Per-group reqrsp_to_axi conversion. 
+ for (genvar g = 0; g < NumGroups; g++) begin : gen_per_group_l2 + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_per_ch + reqrsp_to_axi #( + .MaxTrans ( NumSpatzOutstandingLoads*2 ), + .ID ( '0 ), + .EnBurst ( 1 ), + .ShuffleId ( 1 ), + .UserWidth ( $bits(refill_user_t) ), + .ReqUserFallThrough ( 1'b0 ), + .DataWidth ( AxiDataWidth ), + .AxiUserWidth ( AxiUserWidth ), + .reqrsp_req_t ( l2_req_t ), + .reqrsp_rsp_t ( l2_rsp_t ), + .axi_req_t ( axi_premux_cache_req_t ), + .axi_rsp_t ( axi_premux_cache_resp_t ) + ) i_reqrsp2axi ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .user_i ( l2_req[g][ch].q.user ), + .reqrsp_req_i ( l2_req[g][ch] ), + .reqrsp_rsp_o ( l2_rsp[g][ch] ), + .axi_req_o ( wide_axi_premux_req[g][ch] ), + .axi_rsp_i ( wide_axi_premux_rsp[g][ch] ) + ); + end + end + + // Step 2: Per-L2-channel axi_mux across groups. + if (NumGroups > 1) begin : gen_l2_group_mux + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_ch_mux + // Collect per-group inputs for this channel. 
+ axi_premux_cache_req_t [NumGroups-1:0] l2_mux_slv_req; + axi_premux_cache_resp_t [NumGroups-1:0] l2_mux_slv_rsp; + + for (genvar g = 0; g < NumGroups; g++) begin : gen_l2_mux_connect + assign l2_mux_slv_req[g] = wide_axi_premux_req[g][ch]; + assign wide_axi_premux_rsp[g][ch] = l2_mux_slv_rsp[g]; + end + + axi_mux #( + .SlvAxiIDWidth ( WideIdWidthPreMux ), + .slv_aw_chan_t ( axi_premux_cache_aw_chan_t ), + .mst_aw_chan_t ( axi_slv_cache_aw_chan_t ), + .w_chan_t ( axi_slv_cache_w_chan_t ), + .slv_b_chan_t ( axi_premux_cache_b_chan_t ), + .mst_b_chan_t ( axi_slv_cache_b_chan_t ), + .slv_ar_chan_t ( axi_premux_cache_ar_chan_t ), + .mst_ar_chan_t ( axi_slv_cache_ar_chan_t ), + .slv_r_chan_t ( axi_premux_cache_r_chan_t ), + .mst_r_chan_t ( axi_slv_cache_r_chan_t ), + .slv_req_t ( axi_premux_cache_req_t ), + .slv_resp_t ( axi_premux_cache_resp_t ), + .mst_req_t ( axi_slv_cache_req_t ), + .mst_resp_t ( axi_slv_cache_resp_t ), + .NoSlvPorts ( NumGroups ), + .FallThrough ( 0 ), + .SpillAw ( XbarLatency[4] ), + .SpillW ( XbarLatency[3] ), + .SpillB ( XbarLatency[2] ), + .SpillAr ( XbarLatency[1] ), + .SpillR ( XbarLatency[0] ), + .MaxWTrans ( 2 ) + ) i_axi_l2_mux ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), + .slv_reqs_i ( l2_mux_slv_req ), + .slv_resps_o ( l2_mux_slv_rsp ), + .mst_req_o ( wide_axi_slv_req[ch] ), + .mst_resp_i ( wide_axi_slv_rsp[ch] ) + ); + end + end else begin : gen_l2_no_mux + // Single group: direct connection, no mux needed. + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_ch_direct + assign wide_axi_slv_req[ch] = wide_axi_premux_req[0][ch]; + assign wide_axi_premux_rsp[0][ch] = wide_axi_slv_rsp[ch]; + end end // Optionally decouple the external wide AXI master port. 
@@ -458,20 +455,20 @@ module cachepool_cluster axi_mux #( .SlvAxiIDWidth ( CsrAxiMstIdWidth ), - .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), // AW Channel Type, slave ports - .mst_aw_chan_t ( axi_uart_aw_chan_t ), // AW Channel Type, master port - .w_chan_t ( axi_uart_w_chan_t ), // W Channel Type, all ports - .slv_b_chan_t ( axi_csr_mst_b_chan_t ), // B Channel Type, slave ports - .mst_b_chan_t ( axi_uart_b_chan_t ), // B Channel Type, master port - .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), // AR Channel Type, slave ports - .mst_ar_chan_t ( axi_uart_ar_chan_t ), // AR Channel Type, master port - .slv_r_chan_t ( axi_csr_mst_r_chan_t ), // R Channel Type, slave ports - .mst_r_chan_t ( axi_uart_r_chan_t ), // R Channel Type, master port + .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), + .mst_aw_chan_t ( axi_uart_aw_chan_t ), + .w_chan_t ( axi_uart_w_chan_t ), + .slv_b_chan_t ( axi_csr_mst_b_chan_t ), + .mst_b_chan_t ( axi_uart_b_chan_t ), + .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), + .mst_ar_chan_t ( axi_uart_ar_chan_t ), + .slv_r_chan_t ( axi_csr_mst_r_chan_t ), + .mst_r_chan_t ( axi_uart_r_chan_t ), .slv_req_t ( axi_csr_mst_req_t ), .slv_resp_t ( axi_csr_mst_resp_t ), .mst_req_t ( axi_uart_req_t ), .mst_resp_t ( axi_uart_resp_t ), - .NoSlvPorts ( NumTiles ), // Number of Masters for the module + .NoSlvPorts ( NumTiles ), .FallThrough ( 0 ), .SpillAw ( XbarLatency[4] ), .SpillW ( XbarLatency[3] ), @@ -480,9 +477,9 @@ module cachepool_cluster .SpillR ( XbarLatency[0] ), .MaxWTrans ( 2 ) ) i_axi_uart_mux ( - .clk_i ( clk_i ), // Clock - .rst_ni ( rst_ni ), // Asynchronous reset active low - .test_i ( '0 ), // Test Mode enable + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), .slv_reqs_i ( axi_uart_mux_req ), .slv_resps_o ( axi_uart_mux_rsp ), .mst_req_o ( axi_narrow_req_o ), @@ -544,20 +541,20 @@ module cachepool_cluster axi_mux #( .SlvAxiIDWidth ( CsrAxiMstIdWidth ), - .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), // AW Channel Type, slave ports - .mst_aw_chan_t ( 
axi_csr_slv_aw_chan_t ), // AW Channel Type, master port - .w_chan_t ( axi_csr_slv_w_chan_t ), // W Channel Type, all ports - .slv_b_chan_t ( axi_csr_mst_b_chan_t ), // B Channel Type, slave ports - .mst_b_chan_t ( axi_csr_slv_b_chan_t ), // B Channel Type, master port - .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), // AR Channel Type, slave ports - .mst_ar_chan_t ( axi_csr_slv_ar_chan_t ), // AR Channel Type, master port - .slv_r_chan_t ( axi_csr_mst_r_chan_t ), // R Channel Type, slave ports - .mst_r_chan_t ( axi_csr_slv_r_chan_t ), // R Channel Type, master port + .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), + .mst_aw_chan_t ( axi_csr_slv_aw_chan_t ), + .w_chan_t ( axi_csr_slv_w_chan_t ), + .slv_b_chan_t ( axi_csr_mst_b_chan_t ), + .mst_b_chan_t ( axi_csr_slv_b_chan_t ), + .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), + .mst_ar_chan_t ( axi_csr_slv_ar_chan_t ), + .slv_r_chan_t ( axi_csr_mst_r_chan_t ), + .mst_r_chan_t ( axi_csr_slv_r_chan_t ), .slv_req_t ( axi_csr_mst_req_t ), .slv_resp_t ( axi_csr_mst_resp_t ), .mst_req_t ( axi_csr_slv_req_t ), .mst_resp_t ( axi_csr_slv_resp_t ), - .NoSlvPorts ( NumTiles + 1 ), // Number of Masters for the module + .NoSlvPorts ( NumTiles + 1 ), .FallThrough ( 0 ), .SpillAw ( XbarLatency[4] ), .SpillW ( XbarLatency[3] ), diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index 30a4285..5bf72af 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -107,7 +107,12 @@ module cachepool_group parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` and `L1NumTagBank` is changed ***/ - parameter int unsigned NrSramCfg = 1 + parameter int unsigned NrSramCfg = 1, + + localparam int unsigned TotRGPorts = (NumRemoteGroupPortCore == 0) ? 
0 : + NumTilesPerGroup*NumRemoteGroupPortCore*NrTCDMPortsPerCore-1, + localparam int unsigned NumRemoteGroupPortTile = (NumRemoteGroupPortCore == 0) ? 1 : + NumRemoteGroupPortCore * NrTCDMPortsPerCore ) ( /// System clock. input logic clk_i, @@ -137,8 +142,8 @@ module cachepool_group /// Partitioning address input axi_addr_t private_start_addr_i, /// AXI Narrow out-port (UART/Peripheral) - output axi_narrow_req_t [GroupNarrowAxiPorts-1:0] axi_narrow_req_o, - input axi_narrow_resp_t [GroupNarrowAxiPorts-1:0] axi_narrow_rsp_i, + output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, + input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, /// DRAM refill reqrsp ports (post-xbar, one per L2 channel) output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, @@ -152,8 +157,21 @@ module cachepool_group input logic [3:0] l1d_private_i, input cache_insn_t l1d_insn_i, input logic l1d_insn_valid_i, - output logic [NumTiles-1:0] l1d_insn_ready_o, - input logic [NumTiles-1:0] l1d_busy_i, + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, + + /// Inter-group remote access ports (to other groups). + /// Layout: [NumTilesPerGroup-1:0][NumRemoteGroupPortTile-1:0] flattened to + /// [NumTilesPerGroup * NumRemoteGroupPortTile - 1 : 0]. + /// Per-tile flat index: j + r * NrTCDMPortsPerCore (j = interco instance, + /// r = inter-group slot within that instance). + /// NumRemoteGroupPortTile = NumRemoteGroupPortCore * NrTCDMPortsPerCore. + /// Uses REQRSP-style types with built-in ready and remote_group_user_t. 
+ output remote_group_req_t [TotRGPorts:0] remote_group_req_o, + input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, + /// Inter-group remote access ports (from other groups) + input remote_group_req_t [TotRGPorts:0] remote_group_req_i, + output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// SRAM Configuration input impl_in_t [NrSramCfg-1:0] impl_i, @@ -174,9 +192,16 @@ module cachepool_group localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); localparam int unsigned TileIDWidth = cf_math_pkg::idx_width(NumTiles); + // Per-group overrides of package-level constants that depend on NumTiles/NumCores. + localparam int unsigned NrCoresTileLocal = NrCores / NumTilesPerGroup; + localparam int unsigned NumL1CacheCtrlLocal = NrCores; + localparam int unsigned NumL1CtrlTileLocal = NumL1CacheCtrlLocal / NumTilesPerGroup; + // Enlarge the address width for Spatz due to cache localparam int unsigned TCDMAddrWidth = L1AddrWidth; + // Per-tile inter-group remote port count (across all interco instances). 
+ // Core Request, SoC Request localparam int unsigned NrNarrowMasters = 2; @@ -211,38 +236,44 @@ module cachepool_group // CachePool Tile // --------------- - logic [NumTiles-1:0] error; + logic [NumTilesPerGroup-1:0] error; assign error_o = |error; // Internal tile-side wide AXI: split into two flat arrays by port function // BootROM (TileBootROM=0): muxed into single shared bootrom in this group - axi_mst_cache_req_t [NumTiles-1:0] axi_tile_bootrom_req; - axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_bootrom_rsp; + axi_mst_cache_req_t [NumTilesPerGroup-1:0] axi_tile_bootrom_req; + axi_mst_cache_resp_t [NumTilesPerGroup-1:0] axi_tile_bootrom_rsp; // TileMem (TileMem=1): stays in group, fed into axi_to_reqrsp - axi_mst_cache_req_t [NumTiles-1:0] axi_tile_mem_req; - axi_mst_cache_resp_t [NumTiles-1:0] axi_tile_mem_rsp; + axi_mst_cache_req_t [NumTilesPerGroup-1:0] axi_tile_mem_req; + axi_mst_cache_resp_t [NumTilesPerGroup-1:0] axi_tile_mem_rsp; + + // Per-group bootrom mux AXI type: the mux prepends $clog2(NumTilesPerGroup) + // bits to the ID, not $clog2(NumTiles) as the package assumes. 
+ localparam int unsigned LocalBootRomIdWidth = WideIdWidthIn + $clog2(NumTilesPerGroup); + typedef logic [LocalBootRomIdWidth-1:0] local_bootrom_id_t; + `AXI_TYPEDEF_ALL(local_bootrom, addr_t, local_bootrom_id_t, data_cache_t, strb_cache_t, user_cache_t) // Mux all per-tile BootROM AXI ports into a single bootrom instance - axi_bootrom_slv_req_t axi_bootrom_mux_req; - axi_bootrom_slv_resp_t axi_bootrom_mux_rsp; + local_bootrom_req_t axi_bootrom_mux_req; + local_bootrom_resp_t axi_bootrom_mux_rsp; - if (NumTiles > 1) begin : gen_bootrom_mux + if (NumTilesPerGroup > 1) begin : gen_bootrom_mux axi_mux #( .SlvAxiIDWidth ( WideIdWidthIn ), .slv_aw_chan_t ( axi_mst_cache_aw_chan_t ), - .mst_aw_chan_t ( axi_bootrom_slv_aw_chan_t ), + .mst_aw_chan_t ( local_bootrom_aw_chan_t ), .w_chan_t ( axi_mst_cache_w_chan_t ), .slv_b_chan_t ( axi_mst_cache_b_chan_t ), - .mst_b_chan_t ( axi_bootrom_slv_b_chan_t ), + .mst_b_chan_t ( local_bootrom_b_chan_t ), .slv_ar_chan_t ( axi_mst_cache_ar_chan_t ), - .mst_ar_chan_t ( axi_bootrom_slv_ar_chan_t ), + .mst_ar_chan_t ( local_bootrom_ar_chan_t ), .slv_r_chan_t ( axi_mst_cache_r_chan_t ), - .mst_r_chan_t ( axi_bootrom_slv_r_chan_t ), + .mst_r_chan_t ( local_bootrom_r_chan_t ), .slv_req_t ( axi_mst_cache_req_t ), .slv_resp_t ( axi_mst_cache_resp_t ), - .mst_req_t ( axi_bootrom_slv_req_t ), - .mst_resp_t ( axi_bootrom_slv_resp_t ), - .NoSlvPorts ( NumTiles ), + .mst_req_t ( local_bootrom_req_t ), + .mst_resp_t ( local_bootrom_resp_t ), + .NoSlvPorts ( NumTilesPerGroup ), .FallThrough ( 0 ), .SpillAw ( XbarLatency[4] ), .SpillW ( XbarLatency[3] ), @@ -260,8 +291,8 @@ module cachepool_group .mst_resp_i ( axi_bootrom_mux_rsp ) ); end else begin : gen_bootrom_connect - // NumTiles==1: direct connect, no ID widening needed - assign axi_bootrom_mux_req = axi_bootrom_slv_req_t'(axi_tile_bootrom_req[0]); + // NumTilesPerGroup==1: direct connect, no ID widening needed + assign axi_bootrom_mux_req = local_bootrom_req_t'(axi_tile_bootrom_req[0]); 
assign axi_tile_bootrom_rsp[0] = axi_mst_cache_resp_t'(axi_bootrom_mux_rsp); end @@ -276,10 +307,10 @@ module cachepool_group .AXI_MAX_WRITE_TXNS ( 1 ), .AXI_MAX_READ_TXNS ( 1 ), .DECOUPLE_W ( 0 ), - .ID_WIDTH ( BootRomAxiSlvIdWidth ), + .ID_WIDTH ( LocalBootRomIdWidth ), .USER_WIDTH ( AxiUserWidth ), - .axi_req_t ( axi_bootrom_slv_req_t ), - .axi_rsp_t ( axi_bootrom_slv_resp_t ), + .axi_req_t ( local_bootrom_req_t ), + .axi_rsp_t ( local_bootrom_resp_t ), .reg_req_t ( reg_bootrom_req_t ), .reg_rsp_t ( reg_bootrom_rsp_t ) ) i_axi_to_reg_bootrom ( @@ -302,18 +333,18 @@ module cachepool_group `FF(bootrom_reg_rsp.ready, bootrom_reg_req.valid, 1'b0) assign bootrom_reg_rsp.error = 1'b0; - // Cache refill ports from tiles (NumL1CacheCtrl = NumCores total) - cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req; - cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp; + // Cache refill ports from tiles (NumL1CacheCtrlLocal = NumCores total) + cache_trans_req_t [NumL1CacheCtrlLocal-1:0] cache_refill_req; + cache_trans_rsp_t [NumL1CacheCtrlLocal-1:0] cache_refill_rsp; // cache_core_req/rsp: icache-bypass path, one per tile (from axi_to_reqrsp) - cache_trans_req_t [NumTiles-1:0] cache_core_req; - cache_trans_rsp_t [NumTiles-1:0] cache_core_rsp; + cache_trans_req_t [NumTilesPerGroup-1:0] cache_core_req; + cache_trans_rsp_t [NumTilesPerGroup-1:0] cache_core_rsp; - // Flat xbar input channels: NumTiles * NumClusterMst ports - cache_trans_req_chan_t [NumTiles*NumClusterMst-1:0] tile_req_chan; - cache_trans_rsp_chan_t [NumTiles*NumClusterMst-1:0] tile_rsp_chan; - logic [NumTiles*NumClusterMst-1:0] tile_req_valid, tile_req_ready, + // Flat xbar input channels: NumTilesPerGroup * NumClusterMst ports + cache_trans_req_chan_t [NumTilesPerGroup*NumClusterMst-1:0] tile_req_chan; + cache_trans_rsp_chan_t [NumTilesPerGroup*NumClusterMst-1:0] tile_rsp_chan; + logic [NumTilesPerGroup*NumClusterMst-1:0] tile_req_valid, tile_req_ready, tile_rsp_valid, tile_rsp_ready; // Xbar output 
channels: one per L2 channel @@ -323,18 +354,18 @@ module cachepool_group l2_rsp_valid, l2_rsp_ready; // Selection types - typedef logic [$clog2(NumClusterMst*NumTiles)-1:0] l2_sel_t; + typedef logic [$clog2(NumClusterMst*NumTilesPerGroup)-1:0] l2_sel_t; typedef logic [$clog2(ClusterWideOutAxiPorts) :0] tile_sel_err_t; // one extra bit for OOB typedef logic [$clog2(ClusterWideOutAxiPorts)-1:0] tile_sel_t; - tile_sel_err_t [NumTiles*NumClusterMst-1:0] tile_sel_err; - tile_sel_t [NumTiles*NumClusterMst-1:0] tile_sel; + tile_sel_err_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel_err; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel; l2_sel_t [ClusterWideOutAxiPorts-1:0] tile_selected; l2_sel_t [ClusterWideOutAxiPorts-1:0] l2_sel; - tile_sel_t [NumTiles*NumClusterMst-1:0] l2_rsp_rr; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] l2_rsp_rr; - logic [NumTiles*NumClusterMst-1:0] rr_lock_d, rr_lock_q; - tile_sel_t [NumTiles*NumClusterMst-1:0] l2_prio_d, l2_prio_q; + logic [NumTilesPerGroup*NumClusterMst-1:0] rr_lock_d, rr_lock_q; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] l2_prio_d, l2_prio_q; // port_id: which xbar input port does each L2 channel response target l2_sel_t [ClusterWideOutAxiPorts-1:0] port_id; @@ -346,7 +377,7 @@ module cachepool_group // --------------------- // axi_to_reqrsp: TileMem (icache-bypass) path, one per tile // --------------------- - for (genvar t = 0; t < NumTiles; t++) begin : gen_axi_converter + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_axi_converter axi_to_reqrsp #( .axi_req_t ( axi_mst_cache_req_t ), .axi_rsp_t ( axi_mst_cache_resp_t ), @@ -375,7 +406,7 @@ module cachepool_group // p=1..NumL1CtrlTile -> refill (cache_refill_req) localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; always_comb begin - for (int t = 0; t < NumTiles; t++) begin + for (int t = 0; t < NumTilesPerGroup; t++) begin for (int p = 0; p < ReqrspPortsTile; p++) begin automatic int unsigned xbar_idx = t * ReqrspPortsTile + p; 
automatic int unsigned refill_idx = t * NumL1CtrlTile + p - 1; @@ -428,7 +459,7 @@ module cachepool_group logic [$clog2(ClusterWideOutAxiPorts):0] default_idx; assign default_idx = ClusterWideOutAxiPorts; - for (genvar inp = 0; inp < NumClusterMst*NumTiles; inp++) begin : gen_xbar_sel + for (genvar inp = 0; inp < NumClusterMst*NumTilesPerGroup; inp++) begin : gen_xbar_sel addr_decode_napot #( .NoIndices ( ClusterWideOutAxiPorts+1 ), .NoRules ( ClusterWideOutAxiPorts ), @@ -461,7 +492,7 @@ module cachepool_group `FF(rr_lock_q, rr_lock_d, 1'b0) `FF(l2_prio_q, l2_prio_d, 1'b0) - for (genvar port = 0; port < NumTiles*NumClusterMst; port++) begin : gen_rsp_rr + for (genvar port = 0; port < NumTilesPerGroup*NumClusterMst; port++) begin : gen_rsp_rr tile_sel_t l2_rr; logic [ClusterWideOutAxiPorts-1:0] arb_valid; @@ -523,7 +554,7 @@ module cachepool_group // Refill (DRAM) xbar // --------------------- reqrsp_xbar #( - .NumInp ( NumClusterMst*NumTiles ), + .NumInp ( NumClusterMst*NumTilesPerGroup ), .NumOut ( ClusterWideOutAxiPorts ), .PipeReg ( 1'b1 ), .ExtReqPrio ( 1'b0 ), @@ -539,7 +570,7 @@ module cachepool_group .slv_rsp_o ( tile_rsp_chan ), .slv_rsp_valid_o ( tile_rsp_valid ), .slv_rsp_ready_i ( tile_rsp_ready ), - .slv_sel_i ( tile_sel[NumTiles*NumClusterMst-1:0] ), + .slv_sel_i ( tile_sel[NumTilesPerGroup*NumClusterMst-1:0] ), .slv_rr_i ( '0 ), .slv_selected_o ( tile_selected ), .mst_req_o ( l2_req_chan ), @@ -591,31 +622,36 @@ module cachepool_group // Tile remote access signals // In/Out relative to the tile (out--leave a tile; in--enter a tile) // Tile-side flat layout: index = j + r*NrTCDMPortsPerCore (j=xbar idx, r=remote slot within xbar) - tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_req; - tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_rsp; - logic [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_ready, tile_remote_out_ready; + tcdm_req_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_out_req; + tcdm_rsp_t 
[NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_out_rsp; + logic [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_ready, tile_remote_out_ready; - tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_req; - tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_rsp; + tcdm_req_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_req; + tcdm_rsp_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_rsp; - // Xbar-side: NrTCDMPortsPerCore xbars, each with NumTiles*NumRemotePortCore ports + // Xbar-side: NrTCDMPortsPerCore xbars, each with NumTilesPerGroup*NumRemotePortCore ports // Xbar port index = t*NumRemotePortCore + r - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; + tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_rsp_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; + + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; + tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_rsp_chan; + logic 
[NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; + // Per-group override of package-level remote xbar selection width. + // The package uses NumTiles (total), but the group's xbar is sized per-group. + localparam int unsigned LocalRemoteXbarSelWidth = $clog2(NumTilesPerGroup * NumRemotePortCore); + typedef logic [LocalRemoteXbarSelWidth-1:0] local_remote_xbar_sel_t; // Tile-side selection: narrow type, only carries tile_id - remote_tile_sel_t [NumTiles-1:0][NumRemotePortTile-1:0] remote_out_sel_tile; + remote_tile_sel_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] remote_out_sel_tile; // Xbar-side selection: wider type, encodes tile_id*NumRemotePortCore + core_id%NumRemotePortCore - remote_xbar_sel_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] remote_out_sel_xbar, remote_in_sel_xbar; + local_remote_xbar_sel_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] remote_out_sel_xbar, remote_in_sel_xbar; - for (genvar t = 0; t < NumTiles; t++) begin + for (genvar t = 0; t < NumTilesPerGroup; t++) begin for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin for (genvar r = 0; r < NumRemotePortCore; r++) begin // tile flat index: j + r*NrTCDMPortsPerCore @@ -636,112 +672,206 @@ module cachepool_group assign tile_remote_in_rsp_valid[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p_valid; assign tile_remote_in_req_ready[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].q_ready; - // Request selection: 
route to target tile's remote-in slot based on - // target tile ID, so that all requests to the same destination tile - // travel through one pipeline — preserving write-before-read ordering. - assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + // Request selection: convert narrow tile_id to wide xbar index by appending + // core_id % NumRemotePortCore (available in the request channel user field) + assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] * NumRemotePortCore - + remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] % NumRemotePortCore); + + tile_remote_out_req_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); - // Response selection: route back to source tile's remote-out slot. - // The originator (tile_id in user field) sent on slot - // (target_tile % NumRemotePortCore). The responding tile is `t` - // (genvar), so target_tile = t. - assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + // Response selection: recover xbar port from tile_id and core_id in response user field + assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.tile_id * NumRemotePortCore - + t % NumRemotePortCore); + + tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); end end end - for (genvar t = 0; t < NumTiles; t ++) begin : gen_tiles + for (genvar t = 0; t < NumTilesPerGroup; t ++) begin : gen_tiles logic [9:0] hart_base_id; assign hart_base_id = hart_base_id_i + t * NumCoresTile; logic [TileIDWidth-1:0] tile_id; assign tile_id = t; - cachepool_tile #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( 
NumCoresTile ), - .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .TileIDWidth ( TileIDWidth ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_tile ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error [t] ), - .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), - .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), - .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), - .msip_i ( msip_i [t*NumCoresTile+:NumCoresTile] ), - .hart_base_id_i ( hart_base_id ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .tile_id_i ( tile_id ), - .private_start_addr_i ( private_start_addr_i ), - // AXI out for UART - .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), - .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), - // Remote Access Ports - .remote_req_o ( tile_remote_out_req [t] ), - .remote_req_dst_o ( remote_out_sel_tile [t] ), - .remote_rsp_i ( tile_remote_out_rsp [t] ), - 
.remote_rsp_ready_i ( tile_remote_out_ready[t] ), - .remote_req_i ( tile_remote_in_req [t] ), - .remote_rsp_o ( tile_remote_in_rsp [t] ), - .remote_rsp_ready_o ( tile_remote_in_ready [t] ), - // Cache Refill Ports (now internal, connected to group-level xbar) - .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), - .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), - // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) - .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), - .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), - // Peripherals - .icache_events_o ( /* unused */ ), - .icache_prefetch_enable_i ( icache_prefetch_enable_i ), - .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), - .dynamic_offset_i ( dynamic_offset_i ), - .l1d_insn_i ( l1d_insn_i ), - .l1d_private_i ( l1d_private_i ), - .l1d_insn_valid_i ( l1d_insn_valid_i ), - .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), - .l1d_busy_i ( l1d_busy_i [t] ) - ); + if (NumRemoteGroupPortCore == 0) begin : gen_tile + cachepool_tile #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoresTile ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + 
.axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .Xdma ( Xdma ), + .TileIDWidth ( TileIDWidth ), + .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), + .NumTilesPerGroup ( NumTilesPerGroup ), + .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), + .DMAReqFifoDepth ( DMAReqFifoDepth ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_tile ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( error [t] ), + .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), + .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), + .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), + .msip_i ( msip_i [t*NumCoresTile+:NumCoresTile] ), + .hart_base_id_i ( hart_base_id ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .tile_id_i ( tile_id ), + .private_start_addr_i ( private_start_addr_i ), + // AXI out for UART + .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + // Remote Access Ports + .remote_req_o ( tile_remote_out_req [t] ), + .remote_req_dst_o ( remote_out_sel_tile [t] ), + .remote_rsp_i ( tile_remote_out_rsp [t] ), + .remote_rsp_ready_i ( tile_remote_out_ready[t] ), + .remote_req_i ( tile_remote_in_req [t] ), + .remote_rsp_o ( tile_remote_in_rsp [t] ), + .remote_rsp_ready_o ( tile_remote_in_ready [t] ), + // Inter-group Remote Access Ports (directly exposed to group I/O) + .remote_group_req_o ( ), + .remote_group_rsp_i ( '0 ), + .remote_group_req_i ( '0 ), + .remote_group_rsp_o ( ), + // Cache Refill Ports (now internal, connected to group-level xbar) + .cache_refill_req_o ( 
cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), + .l1d_busy_i ( l1d_busy_i [t] ) + ); + end else begin : gen_tile + cachepool_tile #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoresTile ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .Xdma ( Xdma ), + .TileIDWidth ( TileIDWidth ), + .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), + .NumTilesPerGroup ( NumTilesPerGroup ), + .DMAAxiReqFifoDepth ( 
DMAAxiReqFifoDepth ), + .DMAReqFifoDepth ( DMAReqFifoDepth ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_tile ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( error [t] ), + .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), + .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), + .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), + .msip_i ( msip_i [t*NumCoresTile+:NumCoresTile] ), + .hart_base_id_i ( hart_base_id ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .tile_id_i ( tile_id ), + .private_start_addr_i ( private_start_addr_i ), + // AXI out for UART + .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + // Remote Access Ports + .remote_req_o ( tile_remote_out_req [t] ), + .remote_req_dst_o ( remote_out_sel_tile [t] ), + .remote_rsp_i ( tile_remote_out_rsp [t] ), + .remote_rsp_ready_i ( tile_remote_out_ready[t] ), + .remote_req_i ( tile_remote_in_req [t] ), + .remote_rsp_o ( tile_remote_in_rsp [t] ), + .remote_rsp_ready_o ( tile_remote_in_ready [t] ), + // Inter-group Remote Access Ports (directly exposed to group I/O) + .remote_group_req_o ( remote_group_req_o [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_rsp_i ( remote_group_rsp_i [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_req_i ( remote_group_req_i [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_rsp_o ( remote_group_rsp_o [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + // Cache Refill Ports (now internal, connected to group-level xbar) + .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + 
.cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), + .l1d_busy_i ( l1d_busy_i [t] ) + ); + end end // ------------ @@ -749,11 +879,10 @@ module cachepool_group // ------------ for (genvar p = 0; p < NrTCDMPortsPerCore; p++) begin : gen_remote_tile_xbar - // Decide which tile to go reqrsp_xbar #( - .NumInp (NumTiles * NumRemotePortCore ), - .NumOut (NumTiles * NumRemotePortCore ), + .NumInp (NumTilesPerGroup * NumRemotePortCore ), + .NumOut (NumTilesPerGroup * NumRemotePortCore ), .PipeReg (1'b1 ), .RspReg (1'b1 ), .ExtReqPrio (1'b0 ), diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv new file mode 100644 index 0000000..d64e3c0 --- /dev/null +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -0,0 +1,248 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Description: Wrapper around cachepool_group that handles inter-group +// interconnection (mux/demux, flit packing, routers, receiving xbar). +// +// For now this is a pass-through wrapper with inter-group ports tied off, +// allowing the cluster to instantiate it in place of cachepool_group +// without functional change. The inter-group logic will be added +// incrementally. 
+// +// Author: Diyou Shen + +`include "axi/assign.svh" +`include "axi/typedef.svh" +`include "common_cells/assertions.svh" +`include "common_cells/registers.svh" +`include "mem_interface/assign.svh" +`include "mem_interface/typedef.svh" +`include "register_interface//assign.svh" +`include "register_interface/typedef.svh" +`include "reqrsp_interface/assign.svh" +`include "reqrsp_interface/typedef.svh" +`include "snitch_vm/typedef.svh" +`include "tcdm_interface/assign.svh" +`include "tcdm_interface/typedef.svh" + +module cachepool_group_noc_wrapper + import cachepool_pkg::*; + import spatz_pkg::*; + import fpnew_pkg::fpu_implementation_t; + import snitch_pma_pkg::snitch_pma_t; + import snitch_icache_pkg::icache_events_t; + #( + /// Width of physical address. + parameter int unsigned AxiAddrWidth = 48, + /// Width of AXI port. + parameter int unsigned AxiDataWidth = 512, + /// AXI: id width in. + parameter int unsigned AxiIdWidthIn = 2, + /// AXI: id width out. + parameter int unsigned AxiIdWidthOut = 2, + /// AXI: user width. + parameter int unsigned AxiUserWidth = 1, + /// Address from which to fetch the first instructions. + parameter logic [31:0] BootAddr = 32'h0, + /// Address to indicate start of UART + parameter logic [31:0] UartAddr = 32'h0, + /// The total amount of cores. + parameter int unsigned NrCores = 0, + /// Data/TCDM memory depth per cut (in words). + parameter int unsigned TCDMDepth = 1024, + /// Cluster peripheral address region size (in kB). + parameter int unsigned ClusterPeriphSize = 64, + /// Number of TCDM Banks. + parameter int unsigned NrBanks = 2 * NrCores, + /// Size of DMA AXI buffer. + parameter int unsigned DMAAxiReqFifoDepth = 3, + /// Size of DMA request FIFO. + parameter int unsigned DMAReqFifoDepth = 3, + /// Width of a single icache line. + parameter int unsigned ICacheLineWidth = 0, + /// Number of icache lines per set. + parameter int unsigned ICacheLineCount = 0, + /// Number of icache sets. 
+ parameter int unsigned ICacheSets = 0, + /// Per-core enabling of the custom `Xdma` ISA extensions. + parameter bit [NrCores-1:0] Xdma = '{default: '0}, + /// FPU configuration. + parameter fpu_implementation_t FPUImplementation = fpu_implementation_t'(0), + /// Number of Spatz FPUs + parameter int unsigned NumSpatzFPUs = 1, + /// Number of Spatz IPUs + parameter int unsigned NumSpatzIPUs = 1, + /// Physical Memory Attributes Configuration + parameter snitch_pma_t SnitchPMACfg = '0, + /// # Outstanding loads + parameter int unsigned NumIntOutstandingLoads = 1, + parameter int unsigned NumIntOutstandingMem = 4, + parameter int unsigned NumSpatzOutstandingLoads = 4, + /// Insert Pipeline registers into off-loading path (roles) + parameter bit RegisterOffloadRsp = 1, + /// Insert Pipeline registers into data cache request path + parameter bit RegisterCoreReq = 0, + /// Insert Pipeline registers into data cache response path + parameter bit RegisterCoreRsp = 0, + /// Insert Pipeline registers after each memory cut + parameter bit RegisterTCDMCuts = 1'b0, + /// Decouple external AXI plug + parameter bit RegisterExt = 1'b0, + parameter axi_pkg::xbar_latency_e XbarLatency = axi_pkg::CUT_ALL_PORTS, + /// Outstanding transactions on the AXI network + parameter int unsigned MaxMstTrans = 4, + parameter int unsigned MaxSlvTrans = 4, + /// # Interface + /// AXI Ports + parameter type axi_in_req_t = logic, + parameter type axi_in_resp_t = logic, + parameter type axi_narrow_req_t = logic, + parameter type axi_narrow_resp_t = logic, + parameter type axi_out_req_t = logic, + parameter type axi_out_resp_t = logic, + /// SRAM configuration + parameter type impl_in_t = logic, + // Memory latency parameter. + parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, + /// # SRAM Configuration rules needed + parameter int unsigned NrSramCfg = 1 + ) ( + /// System clock. + input logic clk_i, + /// Asynchronous active high reset. 
+ input logic rst_ni, + /// Per-core debug request signal. + input logic [NrCores-1:0] debug_req_i, + /// Machine external interrupt pending. + input logic [NrCores-1:0] meip_i, + /// Machine timer interrupt pending. + input logic [NrCores-1:0] mtip_i, + /// Core software interrupt pending. + input logic [NrCores-1:0] msip_i, + /// First hartid of the cluster. + input logic [9:0] hart_base_id_i, + /// Base address of cluster. + input axi_addr_t cluster_base_addr_i, + /// Partitioning address + input axi_addr_t private_start_addr_i, + /// AXI Narrow out-port (UART/Peripheral) + output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, + input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, + /// DRAM refill reqrsp ports (post-xbar, one per L2 channel) + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, + /// Peripheral signals + output icache_events_t [NrCores-1:0] icache_events_o, + input logic icache_prefetch_enable_i, + input logic [NrCores-1:0] cl_interrupt_i, + input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, + input logic [3:0] l1d_private_i, + input cache_insn_t l1d_insn_i, + input logic l1d_insn_valid_i, + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, + /// SRAM Configuration + input impl_in_t [NrSramCfg-1:0] impl_i, + /// Indicate the program execution is error + output logic error_o + ); + + + // ------------------------------------------------------------------------- + // Inter-group remote signals + // ------------------------------------------------------------------------- + // Total per-group inter-group port count. + localparam int unsigned NumRemoteGroupPortTile = (NumRemoteGroupPortCore == 0) ? 
1 + : NumRemoteGroupPortCore * NrTCDMPortsPerCore; + localparam int unsigned NumRemoteGroupPortGroup = NumRemoteGroupPortTile * NumTilesPerGroup; + + remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_to_group; + remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_from_group; + remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_from_group; + remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_to_group; + + // Tie off incoming inter-group requests: no traffic from other groups (for now). + assign remote_group_req_to_group = '0; + assign remote_group_rsp_to_group = '0; + + + + + // ------------------------------------------------------------------------- + // Group instantiation + // ------------------------------------------------------------------------- + cachepool_group #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( AxiIdWidthOut ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NrCores ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_out_req_t ), + .axi_out_resp_t ( axi_out_resp_t ), + .Xdma ( Xdma ), + .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), + .DMAReqFifoDepth ( DMAReqFifoDepth ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + 
.RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_group ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( error_o ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id_i ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .private_start_addr_i ( private_start_addr_i ), + .axi_narrow_req_o ( axi_narrow_req_o ), + .axi_narrow_rsp_i ( axi_narrow_rsp_i ), + // DRAM refill reqrsp (post-xbar, one per L2 channel) + .l2_req_o ( l2_req_o ), + .l2_rsp_i ( l2_rsp_i ), + // Inter-group remote ports (tied off for now) + .remote_group_req_o ( remote_group_req_from_group ), + .remote_group_rsp_i ( remote_group_rsp_to_group ), + .remote_group_req_i ( remote_group_req_to_group ), + .remote_group_rsp_o ( remote_group_rsp_from_group ), + // Peripherals + .icache_events_o ( icache_events_o ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o ), + .l1d_busy_i ( l1d_busy_i ) + ); + +endmodule diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index ef97cfe..07bdf0f 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -52,19 +52,32 @@ package cachepool_pkg; // TILE CONFIG // /////////////////// // How many cores for each tile? - localparam int unsigned NumCoresTile = NumCores / NumTiles; + localparam int unsigned NumCoresTile = NumCores / NumTiles; // How many remote ports for each tile per core's port? 
- localparam int unsigned NumRemotePortCore = `ifdef REMOTE_PORT_PER_CORE `REMOTE_PORT_PER_CORE `else 0 `endif; + localparam int unsigned NumRemotePortCore = `ifdef REMOTE_PORT_PER_CORE `REMOTE_PORT_PER_CORE `else 0 `endif; // How many cores within a tile? This is used to select the ports within a tile. - localparam int unsigned LogNumCoresTile = $clog2(NumCoresTile); + localparam int unsigned LogNumCoresTile = $clog2(NumCoresTile); // 4 ports from Spatz + 1 shared port from Snitch/FPU - localparam int unsigned NrTCDMPortsPerCore = 5; + localparam int unsigned NrTCDMPortsPerCore = 5; // How many remote ports for each tile in total? - localparam int unsigned NumRemotePortTile = NumRemotePortCore * NrTCDMPortsPerCore; + localparam int unsigned NumRemotePortTile = NumRemotePortCore * NrTCDMPortsPerCore; + + //////////////////// + // GROUP CONFIG // + //////////////////// + // How many tiles for each group? + localparam int unsigned NumTilesPerGroup = NumTiles / NumGroups; + + // How many cores for each group? + localparam int unsigned NumCoreGroup = NumCores / NumGroups; + + // How many inter-group remote ports for each tile per core's port? + localparam int unsigned NumRemoteGroupPortCore = `ifdef RG_PORT_PER_CORE `RG_PORT_PER_CORE `else 0 `endif; + //////////////////// // CLUSTER HW // @@ -179,7 +192,12 @@ package cachepool_pkg; // legacy naming localparam int unsigned SpatzAxiIdInWidth = ClusterAxiIdWidth; // localparam int unsigned SpatzAxiIdInWidth = TileAxiIdWidth; - localparam int unsigned SpatzAxiIdOutWidth = ClusterAxiIdWidth + 1; + // Per-group AXI output ID width (pre multi-group mux). + localparam int unsigned GroupAxiIdOutWidth = ClusterAxiIdWidth + 1; + // Cluster-level AXI output ID width: widened by multi-group mux. + // When NumGroups == 1, $clog2(1) == 0 so this equals GroupAxiIdOutWidth. + localparam int unsigned GroupMuxIdBits = (NumGroups > 1) ?
$clog2(NumGroups) : 0; + localparam int unsigned SpatzAxiIdOutWidth = GroupAxiIdOutWidth + GroupMuxIdBits; // Fixed AXI ID width for IWC localparam int unsigned IwcAxiIdOutWidth = SpatzAxiIdOutWidth + 1; @@ -281,6 +299,7 @@ package cachepool_pkg; typedef logic [SpatzAxiIdInWidth-1:0] axi_id_in_t; typedef logic [SpatzAxiIdOutWidth-1:0] axi_id_out_t; + typedef logic [GroupAxiIdOutWidth-1:0] axi_id_group_out_t; typedef logic [SpatzAxiNarrowIdWidth-1:0] axi_narrow_id_t; // legacy name; TODO: remove @@ -371,7 +390,23 @@ package cachepool_pkg; // GROUP TYPES // /////////////////// - typedef logic [RemoteXbarSelWidth-1:0] remote_xbar_sel_t; + typedef logic [RemoteXbarSelWidth-1:0] remote_xbar_sel_t; + typedef logic [$clog2(NrTCDMPortsPerCore)-1:0] portid_t; + + typedef struct packed { + // sender core within tile + logic [CoreIDWidth-1:0] core_id; + // sender tile (globally unique) + logic [TileIDWidth-1:0] tile_id; + // outstanding request ID + reqid_t req_id; + // FPU path indicator + logic is_fpu; + // interco instance index (for demux) + portid_t port_id; + } remote_group_user_t; + + `REQRSP_TYPEDEF_ALL(remote_group, narrow_addr_t, narrow_data_t, narrow_strb_t, remote_group_user_t) ///////////////////// @@ -430,7 +465,9 @@ package cachepool_pkg; // AXI typedef bundles `AXI_TYPEDEF_ALL(spatz_axi_narrow, axi_addr_t, axi_narrow_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(spatz_axi_in, axi_addr_t, axi_id_in_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Per-group AXI output: narrower ID (pre multi-group mux). 
+ `AXI_TYPEDEF_ALL(spatz_axi_group_out, axi_addr_t, axi_id_group_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index e01c0ac..9369942 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -70,6 +70,12 @@ module cachepool_tile parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// Tile ID Width parameter int unsigned TileIDWidth = 0, + /// Number of dedicated inter-group remote ports per xbar plane. + /// When 0, no inter-group ports are generated (single-group mode). + parameter int unsigned NumRemoteGroupPortCore = 0, + /// Number of tiles within a single group (passed to interco for + /// group-id extraction from the address). + parameter int unsigned NumTilesPerGroup = 0, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = '0, @@ -110,7 +116,8 @@ module cachepool_tile parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` and `L1NumTagBank` is changed ***/ - parameter int unsigned NrSramCfg = 1 + parameter int unsigned NrSramCfg = 1, + localparam int unsigned TotRGPorts = (NumRemoteGroupPortCore == 0) ? 0 : NumRemoteGroupPortCore*NrTCDMPortsPerCore-1 ) ( /// System clock. input logic clk_i, @@ -161,6 +168,16 @@ module cachepool_tile input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, output tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_o, output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, + /// Inter-group remote access ports (to other groups). 
+ /// Flat layout: flat index = j + r * NrTCDMPortsPerCore, + /// where j is the interco instance and r is the inter-group remote slot. + /// Total count: NumRemoteGroupPortCore * NrTCDMPortsPerCore. + /// Uses REQRSP-style types with built-in ready and remote_group_user_t. + output remote_group_req_t [TotRGPorts:0] remote_group_req_o, + input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, + /// Inter-group remote access ports (from other groups) + input remote_group_req_t [TotRGPorts:0] remote_group_req_i, + output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// Peripheral signals output icache_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, @@ -589,9 +606,145 @@ module cachepool_tile assign remote_rsp_ready_o = remote_out_pready; + // ------------------------------------------------------------------------- + // Inter-group remote ports – type conversion and flush protection + // ------------------------------------------------------------------------- + // External ports use REQRSP-style remote_group_req_t / remote_group_rsp_t + // (with built-in ready and remote_group_user_t). + // Internal interco uses TCDM-style tcdm_req_t / tcdm_rsp_t. + // This section bridges the two and applies flush gating. + // + // Same flat layout as remote ports: flat = j + r * NrTCDMPortsPerCore. + // Total count: NumRemoteGroupPortCore * NrTCDMPortsPerCore. + + localparam int unsigned NumRemoteGroupPortTile = NumRemoteGroupPortCore * NrTCDMPortsPerCore; + + // Internal TCDM-style signals going to/from the interco. 
+ tcdm_req_t [NumRemoteGroupPortTile-1:0] rg_interco_in_req; // incoming requests to interco + tcdm_rsp_t [NumRemoteGroupPortTile-1:0] rg_interco_in_rsp; // responses from interco (for incoming) + logic [NumRemoteGroupPortTile-1:0] rg_interco_in_pready; // response ready for incoming + + tcdm_req_t [NumRemoteGroupPortTile-1:0] rg_interco_out_req; // outgoing requests from interco + tcdm_rsp_t [NumRemoteGroupPortTile-1:0] rg_interco_out_rsp; // responses returning (for outgoing) + logic [NumRemoteGroupPortTile-1:0] rg_interco_out_pready;// response ready for outgoing + remote_tile_sel_t [NumRemoteGroupPortTile-1:0] rg_interco_out_dst; // target tile from interco + + if (NumRemoteGroupPortCore > 0) begin : gen_remote_group_ports + always_comb begin + for (int j = 0; j < NrTCDMPortsPerCore; j++) begin + for (int r = 0; r < NumRemoteGroupPortCore; r++) begin + automatic int unsigned flat = j + r * NrTCDMPortsPerCore; + + // ----------------------------------------------------------- + // Incoming: REQRSP → TCDM conversion + flush gating → interco + // ----------------------------------------------------------- + rg_interco_in_req[flat] = '{ + q: '{ + addr: remote_group_req_i[flat].q.addr, + write: remote_group_req_i[flat].q.write, + data: remote_group_req_i[flat].q.data, + strb: remote_group_req_i[flat].q.strb, + amo: remote_group_req_i[flat].q.amo, + user: '{ + core_id: remote_group_req_i[flat].q.user.core_id, + tile_id: remote_group_req_i[flat].q.user.tile_id, + req_id: remote_group_req_i[flat].q.user.req_id, + is_fpu: remote_group_req_i[flat].q.user.is_fpu, + default: '0 + }, + default: '0 + }, + q_valid: remote_group_req_i[flat].q_valid && !l1d_busy_i, + default: '0 + }; + + // Interco response (TCDM) → REQRSP for remote_group_rsp_o. 
+ remote_group_rsp_o[flat] = '{ + p: '{ + data: rg_interco_in_rsp[flat].p.data, + write: rg_interco_in_rsp[flat].p.write, + user: '{ + core_id: rg_interco_in_rsp[flat].p.user.core_id, + tile_id: rg_interco_in_rsp[flat].p.user.tile_id, + req_id: rg_interco_in_rsp[flat].p.user.req_id, + is_fpu: rg_interco_in_rsp[flat].p.user.is_fpu, + port_id: portid_t'(j), + default: '0 + }, + default: '0 + }, + p_valid: rg_interco_in_rsp[flat].p_valid, + q_ready: rg_interco_in_rsp[flat].q_ready && !l1d_busy_i, + default: '0 + }; + + // Response ready from the external port (REQRSP p_ready). + rg_interco_in_pready[flat] = remote_group_req_i[flat].p_ready && !l1d_busy_i; + + // ----------------------------------------------------------- + // Outgoing: interco → flush gating → TCDM to REQRSP → output + // ----------------------------------------------------------- + remote_group_req_o[flat] = '{ + q: '{ + addr: rg_interco_out_req[flat].q.addr, + write: rg_interco_out_req[flat].q.write, + data: rg_interco_out_req[flat].q.data, + strb: rg_interco_out_req[flat].q.strb, + amo: rg_interco_out_req[flat].q.amo, + user: '{ + core_id: rg_interco_out_req[flat].q.user.core_id, + tile_id: rg_interco_out_req[flat].q.user.tile_id, + req_id: rg_interco_out_req[flat].q.user.req_id, + is_fpu: rg_interco_out_req[flat].q.user.is_fpu, + port_id: portid_t'(j), + default: '0 + }, + default: '0 + }, + q_valid: rg_interco_out_req[flat].q_valid && !l1d_busy_i, + p_ready: rg_interco_out_pready[flat] && !l1d_busy_i, + default: '0 + }; + + // Returning response (REQRSP) → TCDM for the interco. 
+ rg_interco_out_rsp[flat] = '{ + p: '{ + data: remote_group_rsp_i[flat].p.data, + write: remote_group_rsp_i[flat].p.write, + user: '{ + core_id: remote_group_rsp_i[flat].p.user.core_id, + tile_id: remote_group_rsp_i[flat].p.user.tile_id, + req_id: remote_group_rsp_i[flat].p.user.req_id, + is_fpu: remote_group_rsp_i[flat].p.user.is_fpu, + default: '0 + }, + default: '0 + }, + p_valid: remote_group_rsp_i[flat].p_valid, + q_ready: remote_group_rsp_i[flat].q_ready, + default: '0 + }; + end + end + end + end else begin : gen_remote_group_no_ports + // No inter-group remote ports: tie off outputs. + assign remote_group_rsp_o = '0; + assign remote_group_req_o = '0; + assign rg_interco_in_req = '0; + assign rg_interco_in_pready = '0; + assign rg_interco_out_rsp = '0; + assign rg_interco_out_pready = '0; + assign rg_interco_in_rsp = '0; + assign rg_interco_out_req = '0; + assign rg_interco_out_dst = '0; + end + /// Wire requests after strb handling to the cache controller. /// Each xbar j handles NumRemotePortCore remote slots at flat indices /// j + r*NrTCDMPortsPerCore for r in [0, NumRemotePortCore). + /// Similarly, each xbar j handles NumRemoteGroupPortCore inter-group remote slots at flat indices + /// j + r*NrTCDMPortsPerCore for r in [0, NumRemoteGroupPortCore). for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_xbar // Collect the NumRemotePortCore remote slots for this xbar. 
tcdm_req_t [NumRemotePortCore-1:0] xbar_remote_req_gated; @@ -613,33 +766,92 @@ module cachepool_tile assign remote_req_o [flat] = xbar_remote_req_o [r]; end - tcdm_cache_interco #( - .NumTiles (NumTiles ), - .NumCores (NrCores ), - .NumCache (NumL1CtrlTile ), - .NumTotCache (NumL1CacheCtrl ), - .NumRemotePort (NumRemotePortCore ), - .AddrWidth (TCDMAddrWidth ), - .TileIDWidth (TileIDWidth ), - .tcdm_req_t (tcdm_req_t ), - .tcdm_rsp_t (tcdm_rsp_t ), - .tcdm_req_chan_t (tcdm_req_chan_t ), - .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) - ) i_cache_xbar ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .tile_id_i ( tile_id_i ), - .dynamic_offset_i ( dynamic_offset ), - .private_start_addr_i ( private_start_addr_i ), - .num_private_cache_i ( num_private_cache ), - .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), - .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), - .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), - .tile_sel_o ( xbar_remote_req_dst ), - .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ), - .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ), - .mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} ) - ); + // Collect the NumRemoteGroupPortCore inter-group remote slots for this xbar (same flat layout). + // When NumRemoteGroupPortCore == 0, no inter-group remote signals exist and the interco is + // instantiated without inter-group remote ports (backward-compatible). 
+ if (NumRemoteGroupPortCore > 0) begin : gen_remote_group_slice + tcdm_req_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_req; + tcdm_rsp_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_rsp; + logic [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_pready; + tcdm_req_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_req; + tcdm_rsp_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_rsp; + logic [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_pready; + remote_tile_sel_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_dst; + + for (genvar r = 0; r < NumRemoteGroupPortCore; r++) begin : gen_remote_group_slice_r + localparam int unsigned flat = j + r * NrTCDMPortsPerCore; + // Incoming: from conversion/flush → interco input + assign xbar_remote_group_in_req [r] = rg_interco_in_req [flat]; + assign xbar_remote_group_in_pready [r] = rg_interco_in_pready [flat]; + assign rg_interco_in_rsp [flat] = xbar_remote_group_in_rsp [r]; + // Outgoing: interco output → conversion/flush + assign rg_interco_out_req [flat] = xbar_remote_group_out_req [r]; + assign rg_interco_out_dst [flat] = xbar_remote_group_out_dst [r]; + assign xbar_remote_group_out_rsp [r] = rg_interco_out_rsp [flat]; + assign rg_interco_out_pready [flat] = xbar_remote_group_out_pready[r]; + end + + tcdm_cache_interco #( + .NumTiles (NumTiles ), + .NumCores (NrCores ), + .NumCache (NumL1CtrlTile ), + .NumTotCache (NumL1CacheCtrl ), + .NumRemotePort (NumRemotePortCore ), + .NumRemoteGroupPort (NumRemoteGroupPortCore ), + .NumTilesPerGroup (NumTilesPerGroup ), + .AddrWidth (TCDMAddrWidth ), + .TileIDWidth (TileIDWidth ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) + ) i_cache_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i 
({xbar_remote_group_in_req, xbar_remote_req_gated, cache_req [j]}), + .core_rsp_ready_i ({xbar_remote_group_in_pready, xbar_remote_in_pready, cache_pready [j]}), + .core_rsp_o ({xbar_remote_group_in_rsp, xbar_remote_rsp_xbar, cache_rsp [j]}), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( xbar_remote_group_out_dst ), + .mem_req_o ({xbar_remote_group_out_req, xbar_remote_req_o, cache_xbar_req [j]}), + .mem_rsp_ready_o ({xbar_remote_group_out_pready, xbar_remote_out_pready, cache_xbar_pready[j]}), + .mem_rsp_i ({xbar_remote_group_out_rsp, xbar_remote_rsp_i, cache_xbar_rsp [j]}) + ); + end else begin : gen_no_remote_group + // No inter-group remote ports: instantiate interco without inter-group remote ports (backward-compatible). + tcdm_cache_interco #( + .NumTiles (NumTiles ), + .NumCores (NrCores ), + .NumCache (NumL1CtrlTile ), + .NumTotCache (NumL1CacheCtrl ), + .NumRemotePort (NumRemotePortCore ), + .AddrWidth (TCDMAddrWidth ), + .TileIDWidth (TileIDWidth ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) + ) i_cache_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), + .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), + .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( ), + .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ), + .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ), + .mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} ) + ); + end end for (genvar cb = 0; cb < NumL1CtrlTile; cb++) begin : gen_cache_connect diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index ba49e11..f57397e 100644 --- 
a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -21,6 +21,21 @@ // private_bank = addr_bank_bits % num_private_cache_q // shared_bank = num_private_cache_q + (addr_bank_bits % num_shared_cache_q) // For non-power-of-2 partition sizes this causes uneven bank utilisation. +// +// Multi-group support (NumRemoteGroupPort > 0): +// +// When the cluster contains multiple groups, tile IDs are globally unique +// and encode both the group and tile-within-group: +// tile_id = {group_id, local_tile_id} +// +// The xbar performs three-way routing for shared (non-private) requests: +// 1. Local : same tile -> local cache bank +// 2. Intra-group : same group, diff tile -> remote port (existing xbar) +// 3. Inter-group : different group -> inter-group remote port (new) +// +// Inter-group remote ports are appended after the remote ports on both input and output +// sides of the xbar, preserving full backward compatibility when +// NumRemoteGroupPort == 0. `include "common_cells/registers.svh" @@ -29,16 +44,31 @@ module tcdm_cache_interco #( parameter int unsigned NumTiles = 32'd1, /// Number of inputs into the interconnect (Cores per Tile) (`> 0`). parameter int unsigned NumCores = 32'd0, - /// Number of remote ports added to xbar ('>= 0'). + /// Number of remote ports added to xbar for intra-group traffic ('>= 0'). parameter int unsigned NumRemotePort = 32'd0, + /// Number of dedicated inter-group remote ports ('>= 0'). + /// When 0, the module behaves identically to the single-group configuration. + /// Each inter-group remote port serves as both an output (requests to other groups) and an + /// input (requests arriving from other groups), mirroring NumRemotePort. + parameter int unsigned NumRemoteGroupPort = 32'd0, /// Number of outputs from the interconnect (Cache banks per Tile) (`> 0`). parameter int unsigned NumCache = 32'd0, /// Number of total cache banks across all tiles (used for address scramble).
+ /// For multi-group, this must cover all tiles across all groups. parameter int unsigned NumTotCache = 32'd0, /// Address width in bits (cacheline offset: 512b => 6 bits). parameter int unsigned AddrWidth = 32'd32, /// Tile ID width ('> 0'). + /// In multi-group configurations, TileIDWidth covers the globally unique + /// tile ID which encodes both group and tile-within-group: + /// tile_id = {group_id, local_tile_id} parameter int unsigned TileIDWidth = 32'd1, + /// Number of tiles within a single group. + /// Used to extract the group portion from the address tile field: + /// group_id = addr_tile_bits / NumTilesPerGroup + /// Only relevant when NumRemoteGroupPort > 0. Defaults to NumTiles for + /// backward compatibility (single-group: all tiles are in one group). + parameter int unsigned NumTilesPerGroup = NumTiles, /// Port type of the data request ports. parameter type tcdm_req_t = logic, @@ -68,69 +98,82 @@ module tcdm_cache_interco #( input logic [$clog2(NumCache):0] num_private_cache_i, /// Partitioning address input addr_t private_start_addr_i, - /// Request port (cores + remote-in) ---------------------------------- - input tcdm_req_t [NumCores+NumRemotePort-1:0] core_req_i, + /// Request port (cores + intra-group remote-in + inter-group remote-in) ---- + input tcdm_req_t [NumCores+NumRemotePort+NumRemoteGroupPort-1:0] core_req_i, /// Response ready in. - input logic [NumCores+NumRemotePort-1:0] core_rsp_ready_i, - /// Response port (cores + remote-in). - output tcdm_rsp_t [NumCores+NumRemotePort-1:0] core_rsp_o, + input logic [NumCores+NumRemotePort+NumRemoteGroupPort-1:0] core_rsp_ready_i, + /// Response port (cores + intra-group remote-in + inter-group remote-in). + output tcdm_rsp_t [NumCores+NumRemotePort+NumRemoteGroupPort-1:0] core_rsp_o, /// Memory side ------------------------------------------------------- - /// Which remote tile is targeted (one entry per remote output port).
+ /// Which remote tile is targeted (one entry per intra-group remote output). output tile_id_t [NumRemotePort-1:0] tile_sel_o, - // output logic remote_group_o, - /// Requests to cache banks and remote output ports. - output tcdm_req_t [NumCache+NumRemotePort-1:0] mem_req_o, + /// Which tile is targeted via inter-group remote (one entry per inter-group remote output). + /// Carries the full globally-unique tile ID; the wrapper decomposes it + /// into group XY coordinates for the router and local tile ID for the + /// receiving-side xbar. + output tile_id_t [NumRemoteGroupPort-1:0] remote_group_sel_o, + /// Requests to cache banks, intra-group remote, and inter-group remote ports. + output tcdm_req_t [NumCache+NumRemotePort+NumRemoteGroupPort-1:0] mem_req_o, /// Response ready out. - output logic [NumCache+NumRemotePort-1:0] mem_rsp_ready_o, - /// Responses from cache banks and remote output ports. - input tcdm_rsp_t [NumCache+NumRemotePort-1:0] mem_rsp_i + output logic [NumCache+NumRemotePort+NumRemoteGroupPort-1:0] mem_rsp_ready_o, + /// Responses from cache banks, intra-group remote, and inter-group remote ports. + input tcdm_rsp_t [NumCache+NumRemotePort+NumRemoteGroupPort-1:0] mem_rsp_i ); // ------------------------------------------------------------------------- // Local parameters // ------------------------------------------------------------------------- - // Bits to index into xbar outputs (local banks + one remote slot). - localparam int unsigned NumOutSelBits = $clog2(NumCache + NumRemotePort); + // Total number of xbar input and output ports. + localparam int unsigned NumInp = NumCores + NumRemotePort + NumRemoteGroupPort; + localparam int unsigned NumOut = NumCache + NumRemotePort + NumRemoteGroupPort; + // Bits to index into xbar outputs. + localparam int unsigned NumOutSelBits = $clog2(NumOut); // Bits to index into xbar inputs.
- localparam int unsigned NumInpSelBits = $clog2(NumCores + NumRemotePort); + localparam int unsigned NumInpSelBits = $clog2(NumInp); // Bits needed to select among local cache banks. - localparam int unsigned CacheBankBits = $clog2(NumCache); + localparam int unsigned CacheBankBits = $clog2(NumCache); // Bits needed to select the tile in the shared address space. - // Equals TileIDWidth by construction (NumTotCache / NumCache == NumTiles). - localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); + // Equals TileIDWidth by construction (NumTotCache / NumCache == NumTotalTiles). + localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); + + // Group extraction: number of bits to identify the group within TileID. + // GroupBits = TileBits - LocalTileBits, where LocalTileBits = $clog2(NumTilesPerGroup). + // Only meaningful when NumRemoteGroupPort > 0. + localparam int unsigned LocalTileBits = $clog2(NumTilesPerGroup); + localparam int unsigned GroupBits = TileBits - LocalTileBits; // ------------------------------------------------------------------------- // Types // ------------------------------------------------------------------------- typedef logic [NumInpSelBits-1:0] mem_sel_t; - typedef logic [NumOutSelBits -1:0] core_sel_t; + typedef logic [NumOutSelBits-1:0] core_sel_t; // ------------------------------------------------------------------------- // Internal signals // ------------------------------------------------------------------------- // Xbar routing signals. - core_sel_t [NumCores+NumRemotePort-1:0] core_req_sel; - mem_sel_t [NumCache+NumRemotePort-1:0] mem_rsp_sel; + core_sel_t [NumInp-1:0] core_req_sel; + mem_sel_t [NumOut-1:0] mem_rsp_sel; // '1' when this request stays on local banks. - logic [NumCores+NumRemotePort-1:0] local_sel; + logic [NumInp-1:0] local_sel; // '1' when a request targets the private partition. - logic [NumCores+NumRemotePort-1:0] is_private; + logic [NumInp-1:0] is_private; // Xbar channel signals. 
- tcdm_req_chan_t [NumCores+NumRemotePort-1:0] core_req; - logic [NumCores+NumRemotePort-1:0] core_req_valid, core_req_ready; + tcdm_req_chan_t [NumInp-1:0] core_req; + logic [NumInp-1:0] core_req_valid, core_req_ready; - tcdm_req_chan_t [NumCache+NumRemotePort-1:0] mem_req; - logic [NumCache+NumRemotePort-1:0] mem_req_valid, mem_req_ready; + tcdm_req_chan_t [NumOut-1:0] mem_req; + logic [NumOut-1:0] mem_req_valid, mem_req_ready; - tcdm_rsp_chan_t [NumCores+NumRemotePort-1:0] core_rsp; - logic [NumCores+NumRemotePort-1:0] core_rsp_valid, core_rsp_ready; + tcdm_rsp_chan_t [NumInp-1:0] core_rsp; + logic [NumInp-1:0] core_rsp_valid, core_rsp_ready; - tcdm_rsp_chan_t [NumCache+NumRemotePort-1:0] mem_rsp; - logic [NumCache+NumRemotePort-1:0] mem_rsp_valid, mem_rsp_ready; + tcdm_rsp_chan_t [NumOut-1:0] mem_rsp; + logic [NumOut-1:0] mem_rsp_valid, mem_rsp_ready; // ------------------------------------------------------------------------- // Partition control – registered to ease timing @@ -155,7 +198,7 @@ module tcdm_cache_interco #( // Private/shared classification (request side, before xbar) // ------------------------------------------------------------------------- - for (genvar inp = 0; inp < NumCores+NumRemotePort; inp++) begin : gen_is_private + for (genvar inp = 0; inp < NumInp; inp++) begin : gen_is_private assign is_private[inp] = (core_req[inp].addr >= private_start_addr_q); end @@ -164,8 +207,8 @@ module tcdm_cache_interco #( // ------------------------------------------------------------------------- reqrsp_xbar #( - .NumInp (NumCores + NumRemotePort), - .NumOut (NumCache + NumRemotePort), + .NumInp (NumInp ), + .NumOut (NumOut ), .PipeReg (1'b0 ), .ExtReqPrio (1'b0 ), .ExtRspPrio (1'b0 ), @@ -197,28 +240,43 @@ module tcdm_cache_interco #( // Request routing (xbar input-side selection) // ------------------------------------------------------------------------- // - // Address layout (example: offset=6, CacheBankBits=2, TileBits=2): + // Address layout 
(example: offset=6, CacheBankBits=2, TileBits=4 with + // LocalTileBits=2 and GroupBits=2): // - // 31 14 | 13 12 | 11 10 | 9 7 | 5 0 - // Tag | TileID | BankSel | Index | CL offset - // ^-- [offset+CacheBankBits+TileBits-1 : offset+CacheBankBits] - // ^-- [offset+CacheBankBits-1 : offset] + // 31 16 | 15 14 | 13 12 | 11 10 | 9 7 | 5 0 + // Tag | GroupID | LclTID | BankSel | Index | CL offset + // ^-- [offset+CacheBankBits+TileBits-1 : offset+CacheBankBits+LocalTileBits] + // ^-- [offset+CacheBankBits+LocalTileBits-1 : offset+CacheBankBits] + // ^-- [offset+CacheBankBits-1 : offset] // - // Partitioning supports any num_private_cache_q in [0..NumCache]: - // Private banks : ports [0 .. num_private_cache_q-1] - // Shared banks : ports [num_private_cache_q .. NumCache-1] + // Three-way routing classification: + // 1. Local : addr tile == my tile -> route to cache bank + // 2. Intra-group : same group, different tile -> route to remote port + // 3. Inter-group : different group -> route to inter-group remote port // - // Bank selection uses modulo folding: - // private_bank = (addr_bank_bits % num_private_cache_q) - // shared_bank = num_private_cache_q + (addr_bank_bits % num_shared_cache_q) + // Partitioning (private/shared) interacts as follows: + // - Private requests are always local (same as before). + // - Shared requests use the full three-way classification. // - // For power-of-2 partition sizes this reduces to a simple bit mask. - // For non-power-of-2 sizes (e.g. 3) the modulo is a small comparator since - // addr_bank_bits is only CacheBankBits wide. + // The original two-way classification (local vs. remote) is preserved + // when NumRemoteGroupPort == 0, ensuring backward compatibility. + + // Derive this tile's group ID from the globally-unique tile_id_i. 
+ logic [TileBits-1:0] my_group_id; + if (NumRemoteGroupPort == 0) begin + assign my_group_id = tile_id_i; + end else begin + assign my_group_id = tile_id_i[TileBits-1:LocalTileBits]; + end - for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_req_sel + for (genvar port = 0; port < NumInp; port++) begin : gen_req_sel logic [CacheBankBits-1:0] addr_bank; - logic [TileIDWidth-1:0] addr_tile; + // Full tile ID extracted from the address (covers group + local tile). + logic [TileBits-1:0] addr_tile_id; + // Group portion of the address tile field. + logic [TileBits-1:0] addr_group_id; + // Whether the addressed group matches this tile's group. + logic same_group; always_comb begin // Defaults. @@ -226,41 +284,63 @@ module tcdm_cache_interco #( core_req_sel[port] = '0; // Extract the raw BankSel field from the address. - addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; - // Extract the target TileID from the address (used for remote port selection). - addr_tile = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; - - if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) || NumTiles == 1) begin - // All-private or single-tile: every request is local. + addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; + // Extract the full tile ID (group + local) from the address. + addr_tile_id = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; + // Extract group portion (upper bits of tile ID). + addr_group_id = addr_tile_id >> LocalTileBits; + // Compare group IDs. + same_group = (addr_group_id == my_group_id); + + if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) + || (NumTiles == 1 && NumRemoteGroupPort == 0)) begin + // All-private, or single-tile single-group: every request is local. // Use the full BankSel field directly (no folding needed). 
local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank); end else if (num_private_cache_q == '0) begin - // All-shared: check TileID to decide local vs. remote. - // Use the full BankSel field directly (no folding needed). - local_sel[port] = (addr_tile == tile_id_i); - // Route remote requests by target tile ID so that all accesses to the - // same tile share a single pipeline, preserving write-before-read - // ordering across barriers. - core_req_sel[port] = local_sel[port] - ? core_sel_t'(addr_bank) - : core_sel_t'(NumCache + (addr_tile % NumRemotePort)); + // All-shared: full three-way classification. + if (NumRemoteGroupPort > 0 && !same_group) begin + // Inter-group: route to inter-group remote port. + local_sel[port] = 1'b0; + core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + + (port % NumRemoteGroupPort)); + end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] + && !(NumTiles == 1)) begin + // Intra-group remote: different tile, same group. + local_sel[port] = 1'b0; + core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); + end else begin + // Local: same tile. + local_sel[port] = 1'b1; + core_req_sel[port] = core_sel_t'(addr_bank); + end end else begin - // Mixed: fold addr_bank into the appropriate partition via modulo. + // Mixed partition: fold addr_bank into the appropriate partition. if (is_private[port]) begin // Private request: always local. - // bank = addr_bank % num_private_cache_q, offset from bank 0. local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank % num_private_cache_q); end else begin - // Shared request: check TileID to decide local vs. remote. - // bank = num_private_cache_q + (addr_bank % num_shared_cache_q). - local_sel[port] = (addr_tile == tile_id_i); - core_req_sel[port] = local_sel[port] - ? 
core_sel_t'(num_private_cache_q + (addr_bank % num_shared_cache_q)) - : core_sel_t'(NumCache + (addr_tile % NumRemotePort)); + // Shared request: three-way classification. + if (NumRemoteGroupPort > 0 && !same_group) begin + // Inter-group: route to inter-group remote port. + local_sel[port] = 1'b0; + core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + + (port % NumRemoteGroupPort)); + end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] + && !(NumTiles == 1)) begin + // Intra-group remote: different tile, same group. + local_sel[port] = 1'b0; + core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); + end else begin + // Local: same tile. + local_sel[port] = 1'b1; + core_req_sel[port] = core_sel_t'(num_private_cache_q + + (addr_bank % num_shared_cache_q)); + end end end end @@ -269,16 +349,35 @@ module tcdm_cache_interco #( // ------------------------------------------------------------------------- // Response routing (xbar output-side selection) // ------------------------------------------------------------------------- + // + // Responses from local cache banks are routed back to the originating + // core using core_id. Responses from intra-group remote tiles and + // inter-group remote ports carry a tile_id that differs from tile_id_i; + // these are forwarded to the corresponding remote-in or inter-group remote-in port.
+ + for (genvar port = 0; port < NumOut; port++) begin : gen_rsp_sel + logic [TileBits-1:0] rsp_group_id; + if (NumRemoteGroupPort == 0) begin + assign rsp_group_id = my_group_id; + end else begin + assign rsp_group_id = mem_rsp[port].user.tile_id[TileBits-1:LocalTileBits]; + end - for (genvar port = 0; port < NumCache+NumRemotePort; port++) begin : gen_rsp_sel always_comb begin mem_rsp_sel[port] = mem_rsp[port].user.core_id; if (mem_rsp[port].user.tile_id != tile_id_i) begin - // Response destined for a remote tile: forward to the remote interco - // port that matches the incoming request path. The group-level xbar - // routes requests from source tile S to our remote-in slot - // (S % NumRemotePort), so responses must return via the same slot. - mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.tile_id % NumRemotePort)); + // Response originates from a different tile (intra-group remote or + // inter-group remote). Determine which input port set it came from. + if (NumRemoteGroupPort > 0 + && rsp_group_id != my_group_id) begin + // Inter-group: forward to the inter-group remote-in input port. + mem_rsp_sel[port] = mem_sel_t'(NumCores + NumRemotePort + + (mem_rsp[port].user.core_id % NumRemoteGroupPort)); + end else begin + // Intra-group: forward to the remote-in input port. 
+ mem_rsp_sel[port] = mem_sel_t'(NumCores + + (mem_rsp[port].user.core_id % NumRemotePort)); + end end end end @@ -287,7 +386,7 @@ module tcdm_cache_interco #( // Input-side pipeline registers // ------------------------------------------------------------------------- - for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_cache_interco_reg + for (genvar port = 0; port < NumInp; port++) begin : gen_cache_interco_reg spill_register #( .T (tcdm_req_chan_t ) ) i_tcdm_req_reg ( @@ -349,11 +448,11 @@ module tcdm_cache_interco #( // // lower = addr & ((1 << offset) - 1) // CLoffset, verbatim // rot_field = (addr >> offset) & ((1 << N) - 1) // N routing bits - // upper = addr >> (offset + N) // Tag+Index + // upper = addr >> (offset + N) // Tag+Index // // addr_rot = lower - // | (upper << offset) // close the hole - // | (rot_field << (AddrWidth - N)) // park at MSB + // | (upper << offset) // close the hole + // | (rot_field << (AddrWidth - N)) // park at MSB // Width of bits_to_rotate signal: must hold values up to CacheBankBits+TileBits. localparam int unsigned RotWidth = $clog2(CacheBankBits + TileBits + 1) + 1; @@ -408,7 +507,7 @@ module tcdm_cache_interco #( // Output assignment // ------------------------------------------------------------------------- - for (genvar port = 0; port < NumCache + NumRemotePort; port++) begin : gen_cache_io + for (genvar port = 0; port < NumOut; port++) begin : gen_cache_io always_comb begin mem_req_o[port] = '{ q : mem_req[port], @@ -419,10 +518,14 @@ module tcdm_cache_interco #( if (port < NumCache) begin // Local bank: forward address with routing bits rotated to MSB. mem_req_o[port].q.addr = addr_rot[port]; - end else begin - // Remote port: pass address untouched; extract target tile ID. + end else if (port < NumCache + NumRemotePort) begin + // Intra-group remote port: pass address untouched; extract target tile ID. 
tile_sel_o[port - NumCache] = mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; + end else begin + // Inter-group inter-group remote port: pass address untouched; extract target tile ID. + remote_group_sel_o[port - NumCache - NumRemotePort] = + mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; end end diff --git a/sim/scripts/vsim_core.tcl b/sim/scripts/vsim_core.tcl index 9510e33..4021c22 100644 --- a/sim/scripts/vsim_core.tcl +++ b/sim/scripts/vsim_core.tcl @@ -4,179 +4,189 @@ # Create group for core $1 onerror {resume} - -set core_path ${3} - -add wave -noupdate -group tile[$1]_core[$2] -group scalar_xbar ${core_path}/i_cachepool_cc/i_scalar_xbar/* - -add wave -noupdate -group tile[$1]_core[$2] -group Params ${core_path}/i_cachepool_cc/BootAddr -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/clk_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/rst_i -add wave -noupdate -group tile[$1]_core[$2] -radix unsigned ${core_path}/i_cachepool_cc/i_snitch/hart_id_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Instructions -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_addr_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_data_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_valid_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_ready_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Load/Store -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/data_req_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/data_rsp_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Accelerator -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qreq_o -add wave -noupdate -group tile[$1]_core[$2] 
${core_path}/i_cachepool_cc/i_snitch/acc_qrsp_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qvalid_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qready_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_prsp_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_pvalid_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_pready_o - -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/illegal_inst -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/zero_lsb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider LSU -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_size -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_amo -add wave -noupdate -group tile[$1]_core[$2] -group 
Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qvalid -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pvalid -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_load -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_i -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_acc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider ALU -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/iimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/jimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/bimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/simm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/adder_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd 
-add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs1 -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs2 -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_raddr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_rdata -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_waddr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_wdata -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_we -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/consec_pc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_load -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_store -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_signed -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_misaligned -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_addr_misaligned -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/st_addr_misaligned -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/valid_instr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/exception -add wave -noupdate -group tile[$1]_core[$2] -group Snitch 
${core_path}/i_cachepool_cc/i_snitch/alu_op -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/write_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uses_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/next_pc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_bypass -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_branch -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_rvalue -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_en -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_register_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/operands_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/dst_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_reversed -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result -add wave -noupdate 
-group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_ext -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result_ext -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_arithmetic -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_writeback -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/core_events_o - -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -group Internal -group RF ${core_path}/i_cachepool_cc/i_snitch/i_snitch_regfile/* -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -group Internal ${core_path}/i_cachepool_cc/i_snitch/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_valid_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_ready_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_req_i 
-add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_rsp_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_valid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_ready_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_valid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_ready_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_valid_i - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group FPR ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fpr/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group LSU ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fp_lsu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group Controller ${core_path}/i_cachepool_cc/i_spatz/i_controller/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterWrite -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata_i -add wave -noupdate -group tile[$1]_core[$2] -group 
Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wvalid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterRead -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/re_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rvalid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider Internal -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VSLDU ${core_path}/i_cachepool_cc/i_spatz/i_vsldu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VFU 
${core_path}/i_cachepool_cc/i_spatz/i_vfu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group FPU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/gen_fpu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Internal ${core_path}/i_cachepool_cc/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB0 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[0]/i_reorder_buffer/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB1 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[1]/i_reorder_buffer/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_valid -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_ready -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_empty -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_pop -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_push -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo_bypass +quietly WaveActivateNextPane {} 0 + +set core_path ${4} +set name g_${1}_t_${2}_c_${3} + +# Safely handle the optional 5th argument for nesting +set parent_grp [list] +if {$argc > 4 && "${5}" != ""} { + set parent_grp [list -group ${5}] +} + +# The {*} syntax safely expands the list. +# If $parent_grp is empty, it safely ignores it instead of passing "". 
+add wave -noupdate {*}$parent_grp -group ${name} -group scalar_xbar ${core_path}/i_cachepool_cc/i_scalar_xbar/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Params ${core_path}/i_cachepool_cc/BootAddr +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/clk_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/rst_i +add wave -noupdate {*}$parent_grp -group ${name} -radix unsigned ${core_path}/i_cachepool_cc/i_snitch/hart_id_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Instructions +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_addr_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_data_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_valid_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_ready_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Load/Store +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/data_req_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/data_rsp_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Accelerator +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qreq_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qrsp_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qvalid_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qready_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_prsp_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_pvalid_i +add wave -noupdate {*}$parent_grp -group ${name} 
${core_path}/i_cachepool_cc/i_snitch/acc_pready_o + +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/illegal_inst +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/zero_lsb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -divider LSU +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_size +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_amo +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qvalid +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pvalid +add wave 
-noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_load +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_i +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_acc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -divider ALU +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/iimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/jimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/bimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/simm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/adder_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs1 +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs2 +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch 
${core_path}/i_cachepool_cc/i_snitch/gpr_raddr +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_rdata +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_waddr +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_wdata +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_we +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/consec_pc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_load +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_store +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_signed +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_addr_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/st_addr_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/valid_instr +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/exception +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_op +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_select +add wave -noupdate {*}$parent_grp -group ${name} -group 
Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/write_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uses_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/next_pc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_bypass +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_branch +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_rvalue +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_en +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_register_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/operands_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/dst_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_reversed +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left_result +add wave -noupdate 
{*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_ext +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result_ext +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_arithmetic +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opa +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_writeback +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/core_events_o + +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -group Internal -group RF ${core_path}/i_cachepool_cc/i_snitch/i_snitch_regfile/* +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -group Internal ${core_path}/i_cachepool_cc/i_snitch/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_valid_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_ready_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_req_i +add wave -noupdate 
{*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_rsp_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_valid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_ready_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_valid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_ready_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_valid_i + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" -group FPR ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fpr/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" -group LSU ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fp_lsu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group Controller ${core_path}/i_cachepool_cc/i_spatz/i_controller/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider RegisterWrite +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF 
${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wvalid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider RegisterRead +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/re_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rvalid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider Internal +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/* + +add wave -noupdate {*}$parent_grp -group 
${name} -group Spatz -group VSLDU ${core_path}/i_cachepool_cc/i_spatz/i_vsldu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VFU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group FPU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/gen_fpu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Internal ${core_path}/i_cachepool_cc/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU -group ROB0 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[0]/i_reorder_buffer/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU -group ROB1 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[1]/i_reorder_buffer/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_valid +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_empty +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_pop +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_push +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo_bypass diff --git a/sim/scripts/vsim_wave.tcl b/sim/scripts/vsim_wave.tcl index d5fa528..994f504 100644 --- a/sim/scripts/vsim_wave.tcl +++ b/sim/scripts/vsim_wave.tcl @@ -1,12 +1,14 @@ -# Copyright 2021 ETH Zurich and University of Bologna. 
+# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 onerror {resume} quietly WaveActivateNextPane {} 0 +# --- Configuration Variables --- set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster -set group_path ${cluster_path}/gen_group/i_group +set NUM_GROUPS 4 ;# Change this variable to match your total number of groups +set NUM_CORES 4 ;# Assuming 4 cores per tile based on original script # Add the cluster probe add wave /tb_cachepool/cluster_probe @@ -14,23 +16,43 @@ add wave /tb_cachepool/cluster_probe # Cluster do sim/scripts/vsim_cluster.tcl ${cluster_path} -# Group -# add wave -noupdate -group Group ${group_path}/* -do sim/scripts/vsim_group.tcl ${group_path} 5 - -# Tile and Core -for {set tile 0} {$tile < 4} {incr tile} { - set tile_path ${group_path}/gen_tiles[$tile] - - do sim/scripts/vsim_tile.tcl $tile ${tile_path} - # Add all cores in Tile 0 - for {set core 0} {$core < 4} {incr core} { - set core_path ${tile_path}/i_tile/gen_core[$core] - do sim/scripts/vsim_core.tcl $tile $core ${core_path} - } - - for {set ch 0} {$ch < 4} {incr ch} { - add wave -noupdate -group DramSys$ch /tb_cachepool/gen_dram[$ch]/i_axi_dram_sim/* +# Iterate through all groups +for {set g 0} {$g < $NUM_GROUPS} {incr g} { + set group_wp_path ${cluster_path}/gen_group[$g]/i_group + set group_path ${group_wp_path}/i_group + + # 1. Plot all GroupWP levels of all groups + add wave -noupdate -group "GroupWP_$g" ${group_wp_path}/* + + do sim/scripts/vsim_group.tcl ${group_path} 5 + + # Conditional plotting based on the group + if {$g == 0} { + # 2. Call to plot tile 0 and tile 3 for Group 0 only + foreach tile {0 3} { + set tile_path ${group_path}/gen_tiles[$tile]/gen_tile + do sim/scripts/vsim_tile.tcl $tile ${tile_path} + + # 3. 
Plot all cores in the plotted tile + for {set core 0} {$core < $NUM_CORES} {incr core} { + set core_path ${tile_path}/i_tile/gen_core[$core] + # Pass an empty string to indicate NO parent group + do sim/scripts/vsim_core.tcl 0 $tile $core ${core_path} "" + } + } + } else { + # 4. Plot core 0 in tile 0 of other groups + set tile 0 + set core 0 + set tile_path ${group_path}/gen_tiles[$tile]/gen_tile + set core_path ${tile_path}/i_tile/gen_core[$core] + + # FIX: Use 'do' instead of 'source' and pass just the parent group name + do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "GroupWP_$g" } } +# Add DRAM waves once at the end +for {set ch 0} {$ch < 4} {incr ch} { + add wave -noupdate -group "DramSys_$ch" /tb_cachepool/gen_dram[$ch]/i_axi_dram_sim/* +} From 04e57ce8654eeeb11cb198c3ebe4d301449a7d5b Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 4 May 2026 11:08:09 +0200 Subject: [PATCH 05/37] [SW][Periph] Change the cache to all-private configuration by default to run test without intra-group interconnection. Adjust CI tests to check only runnable tests now. 
--- .../cachepool_peripheral/cachepool_peripheral.sv | 2 +- .../cachepool_peripheral_reg.hjson | 2 +- .../cachepool_peripheral_reg_top.sv | 2 +- software/tests/fmatmul-32b/main.c | 12 ++---------- util/auto-benchmark/check-ci.py | 2 +- util/auto-benchmark/configs-ci.sh | 3 ++- 6 files changed, 8 insertions(+), 15 deletions(-) diff --git a/hardware/cachepool_peripheral/cachepool_peripheral.sv b/hardware/cachepool_peripheral/cachepool_peripheral.sv index 6326cfa..c844ac2 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral.sv @@ -172,7 +172,7 @@ module cachepool_peripheral end `FF(private_start_addr_q, private_start_addr_d, 32'hA000_0000, clk_i, rst_ni) - `FF(l1d_private_q, l1d_private_d, '0, clk_i, rst_ni) + `FF(l1d_private_q, l1d_private_d, 4, clk_i, rst_ni) `FF(l1d_lock_q, l1d_lock_d, '0, clk_i, rst_ni) // To show if the current flush/invalidation is complete assign hw2reg.l1d_flush_status.d = (l1d_lock_q != '0); diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson index 79d7cda..068cce3 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson @@ -232,7 +232,7 @@ desc: '''Number of private banks configured per tile ''' swaccess: "rw", hwaccess: "hro", - resval: "0", + resval: "4", fields: [{ bits: "3:0", name: "NUMBER", diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv index c6ece73..adef765 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv @@ -517,7 +517,7 @@ module cachepool_peripheral_reg_top #( prim_subreg #( .DW (4), .SWACCESS("RW"), - .RESVAL (4'h0) + .RESVAL (4'h4) ) u_l1d_private ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git 
a/software/tests/fmatmul-32b/main.c b/software/tests/fmatmul-32b/main.c index abaf908..774b5fa 100644 --- a/software/tests/fmatmul-32b/main.c +++ b/software/tests/fmatmul-32b/main.c @@ -165,8 +165,8 @@ int main() { if (cid == 0) { for (uint32_t j = 0; j < num_cores; j++) { - printf("Core %d error %d\n", j, error[j]); - // error[0] += error[j]; + if (error[j] != 0) + printf("Core %d error %d\n", j, error[j]); } } else { @@ -174,14 +174,6 @@ int main() { } snrt_cluster_hw_barrier(); - - // if (error[0] != 0) { - // if (cid == 0) { - // printf("Check failed, error count:%d\n", error[0]); - // // printf("First iter took %u cycles\n", timer_iter1); - // } - // // return -1; - // } } } diff --git a/util/auto-benchmark/check-ci.py b/util/auto-benchmark/check-ci.py index fa4ef63..1bbf5e4 100644 --- a/util/auto-benchmark/check-ci.py +++ b/util/auto-benchmark/check-ci.py @@ -20,7 +20,7 @@ def main(): # Matches "error " anywhere in a line, captured as group 1. error_val_re = re.compile(r'\berror\s+(\d+)\b', re.IGNORECASE) # Matches FAIL or [FAIL] anywhere in a line. 
- fail_re = re.compile(r'\bFAIL\b', re.IGNORECASE) + fail_re = re.compile(r'\bFailed\b', re.IGNORECASE) failures = [] diff --git a/util/auto-benchmark/configs-ci.sh b/util/auto-benchmark/configs-ci.sh index 11fe23e..3659ad2 100644 --- a/util/auto-benchmark/configs-ci.sh +++ b/util/auto-benchmark/configs-ci.sh @@ -1,5 +1,6 @@ # Configs and kernel suffixes (without prefix) CONFIGS="cachepool_fpu_512" -KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv-opt_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" +KERNELS="fdotp-32b_M32768 gemv-opt_M1024_N128_K32 fmatmul-32b_M64_N64_K64" +# KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv-opt_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" PREFIX="test-cachepool-" # common prefix for all kernels ROOT_PATH=../.. # adjust if needed (path to repo root) From e0c78383bc034203b50ffd131697269088c0bb71 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 4 May 2026 11:41:03 +0200 Subject: [PATCH 06/37] [Bender] Fix a problem with Spatz's path in Bender.lock --- Bender.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Bender.lock b/Bender.lock index c5b55df..4f60875 100644 --- a/Bender.lock +++ b/Bender.lock @@ -133,10 +133,10 @@ packages: - common_cells - tech_cells_generic spatz: - revision: null + revision: ed25c78dd72d839db8141287f9516d78ee399b93 version: null source: - Path: hardware/deps/spatz + Git: https://github.com/pulp-platform/spatz.git dependencies: - axi - axi_riscv_atomics From de22ee9012c2defcf15a9335c0a51230d5f7fd3d Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 12 May 2026 09:11:08 +0200 Subject: [PATCH 07/37] [Runtime] Add missing functions to l1cache functions. 
--- software/snRuntime/include/l1cache.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/software/snRuntime/include/l1cache.h b/software/snRuntime/include/l1cache.h index ecde97b..ba6e9d2 100644 --- a/software/snRuntime/include/l1cache.h +++ b/software/snRuntime/include/l1cache.h @@ -17,6 +17,8 @@ void l1d_xbar_commit(); void l1d_commit(); void l1d_init(uint32_t size); void l1d_flush(); +void l1d_shared_flush(); +void l1d_private_flush(uint32_t tile); void l1d_wait(); void l1d_spm_config (uint32_t size); void l1d_part (uint32_t size); From b3d656f02a1f01d9b517908ad4ae70166298a6b5 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 14 May 2026 09:48:30 +0200 Subject: [PATCH 08/37] [SRC][NoC] Add NoC configurations --- Bender.yml | 2 + Makefile | 17 ++++--- config/floonoc_cachepool_4g.yml | 89 +++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 config/floonoc_cachepool_4g.yml diff --git a/Bender.yml b/Bender.yml index 53cead9..9c64d0d 100644 --- a/Bender.yml +++ b/Bender.yml @@ -30,6 +30,8 @@ sources: - hardware/src/tcdm_cache_interco.sv - hardware/src/tcdm_id_remapper.sv - hardware/src/spatz_cache_amo.sv + # FlooNoC + - hardware/generated/floo_cachepool_noc_pkg.sv # Memory-mapped register - hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv - hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv diff --git a/Makefile b/Makefile index f407f05..3264af7 100644 --- a/Makefile +++ b/Makefile @@ -166,23 +166,26 @@ $(BOOTROM_DIR)/bootrom.sv: $(BOOTROM_DIR)/bootrom.bin $(BOOTROM_DIR)/bootdata.cc ########### # FlooNoC # ########### -FLOO_DIR ?= $(shell $(BENDER_INSTALL_DIR)/bender path floo_noc) -FLOO_GEN_OUTDIR ?= $(ROOT_DIR)/hardware/generated -FLOO_CFG ?= $(ROOT_DIR)/config/floonoc_cachepool_4g.yml -FLOO_SYS = $(subst .yml,,$(notdir $(FLOO_CFG))) -FLOO_NOC ?= $(addprefix $(FLOO_GEN_OUTDIR)/,$(subst .yml,_floo_noc.sv,$(notdir $(FLOO_CFG)))) +FLOO_DIR ?= $(shell $(BENDER_INSTALL_DIR)/bender path 
floo_noc) +FLOO_GEN_OUTDIR ?= $(ROOT_DIR)/hardware/generated +FLOO_CFG ?= $(ROOT_DIR)/config/floonoc_cachepool_4g.yml +FLOO_NAME = cachepool +FLOO_NOC ?= $(FLOO_GEN_OUTDIR)/floo_$(FLOO_NAME)_noc_pkg.sv $(info FLOO_DIR: $(FLOO_DIR)) # Generates the sources for FlooNoC .PHONY: update-floonoc install-floogen clean-floonoc install-floogen: - $(MAKE) -C $(FLOO_DIR) install-floogen + pip install -e $(FLOO_DIR) --quiet update-floonoc: $(FLOO_NOC) $(FLOO_NOC): install-floogen $(FLOO_CFG) mkdir -p $(FLOO_GEN_OUTDIR) - floogen -c $(FLOO_CFG) -o $(FLOO_GEN_OUTDIR) --only-pkg + PATH="$(HOME)/.local/bin:$(PATH)" floogen pkg -c $(FLOO_CFG) -o $(FLOO_GEN_OUTDIR) --no-format + +clean-floonoc: + rm -f $(FLOO_NOC) ########### # DramSys # diff --git a/config/floonoc_cachepool_4g.yml b/config/floonoc_cachepool_4g.yml new file mode 100644 index 0000000..2c81e28 --- /dev/null +++ b/config/floonoc_cachepool_4g.yml @@ -0,0 +1,89 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +name: cachepool +description: "CachePool AXI NoC" +network_type: "axi" + +routing: + route_algo: "SRC" + use_id_table: true + +protocols: + - name: "wide_in" + type: "wide" + protocol: "AXI4" + data_width: 256 + addr_width: 32 + id_width: 2 + user_width: 1 + - name: "wide_out" + type: "wide" + protocol: "AXI4" + data_width: 256 + addr_width: 32 + id_width: 2 + user_width: 1 + +endpoints: + - name: "group" + array: [2, 2] + mgr_port_protocol: + - "wide_in" + - name: "hbm" + array: [4] + addr_range: + base: 0x8000_0000 + size: 0x0010_0000 + sbr_port_protocol: + - "wide_out" + - name: "host_peri" + addr_range: + - start: 0x0000_0000 + end: 0x7FFF_FFFF + - start: 0xA000_0000 + end: 0xC000_FFFF + mgr_port_protocol: + - "wide_in" + sbr_port_protocol: + - "wide_out" + +routers: + - name: "group_router" + array: [2, 2] + degree: 5 + +connections: + - src: "group" + dst: "group_router" + src_range: + - [0, 1] + - [0, 1] + dst_range: + - [0, 1] + - [0, 1] + dst_dir: "Eject" + # HBM West + - src: "hbm" + dst: "group_router" + src_range: + - [0, 1] + dst_range: + - [0, 0] + - [0, 1] + dst_dir: "West" + # HBM East + - src: "hbm" + dst: "group_router" + src_range: + - [2, 3] + dst_range: + - [1, 1] + - [0, 1] + dst_dir: "East" + # Special + - src: "host_peri" + dst: "group_router" + dst_idx: [0, 0] + dst_dir: "South" From 61332df40e8cbda952e2ccc9ebc460cda1c98e1f Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 14 May 2026 09:50:20 +0200 Subject: [PATCH 09/37] WIP: [SRC] Connect TCDM NoC --- config/cachepool_fpu_512.mk | 2 +- .../cachepool_peripheral.sv | 2 +- hardware/src/cachepool_cluster.sv | 116 +++- hardware/src/cachepool_group.sv | 9 +- hardware/src/cachepool_group_noc_wrapper.sv | 540 +++++++++++++++--- hardware/src/cachepool_pkg.sv | 59 +- hardware/src/cachepool_tile.sv | 13 +- hardware/src/tcdm_cache_interco.sv | 17 + 8 files changed, 667 insertions(+), 91 deletions(-) diff --git a/config/cachepool_fpu_512.mk 
b/config/cachepool_fpu_512.mk index a9a5458..89a0815 100644 --- a/config/cachepool_fpu_512.mk +++ b/config/cachepool_fpu_512.mk @@ -9,7 +9,7 @@ ######################### # Number of groups -num_groups ?= 4 +num_groups ?= 2 # Number of tiles num_tiles_per_group ?= 4 diff --git a/hardware/cachepool_peripheral/cachepool_peripheral.sv b/hardware/cachepool_peripheral/cachepool_peripheral.sv index c844ac2..f3ad93a 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral.sv @@ -172,7 +172,7 @@ module cachepool_peripheral end `FF(private_start_addr_q, private_start_addr_d, 32'hA000_0000, clk_i, rst_ni) - `FF(l1d_private_q, l1d_private_d, 4, clk_i, rst_ni) + `FF(l1d_private_q, l1d_private_d, 0, clk_i, rst_ni) `FF(l1d_lock_q, l1d_lock_d, '0, clk_i, rst_ni) // To show if the current flush/invalidation is complete assign hw2reg.l1d_flush_status.d = (l1d_lock_q != '0); diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index ce2aef9..78d84c3 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -253,6 +253,20 @@ module cachepool_cluster // Per-group error signals. 
logic [NumGroups-1:0] group_error; + // Inter-group NoC mesh signals (indexed by group, then direction, then port) + noc_group_req_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out_ready; + noc_group_req_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in_ready; + noc_group_rsp_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out_ready; + noc_group_rsp_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in_ready; + // --------------- // CachePool Group // --------------- @@ -313,6 +327,7 @@ module cachepool_cluster .mtip_i ( mtip_i [g*NumCoreGroup +: NumCoreGroup] ), .msip_i ( msip_i [g*NumCoreGroup +: NumCoreGroup] ), .hart_base_id_i ( hart_base_id_i + 10'(g * NumCoreGroup) ), + .tile_base_id_i ( TileIDWidth'(g * NumTilesPerGroup) ), .cluster_base_addr_i ( cluster_base_addr_i ), .private_start_addr_i ( private_start_addr ), .axi_narrow_req_o ( axi_out_req [g*NumTilesPerGroup +: NumTilesPerGroup] ), @@ -329,10 +344,109 @@ module cachepool_cluster .l1d_insn_i ( l1d_insn ), .l1d_insn_valid_i ( l1d_insn_valid ), .l1d_insn_ready_o ( l1d_insn_ready[g*NumTilesPerGroup +: NumTilesPerGroup]), - .l1d_busy_i ( l1d_busy [g*NumTilesPerGroup +: NumTilesPerGroup]) + .l1d_busy_i ( l1d_busy [g*NumTilesPerGroup +: 
NumTilesPerGroup]), + .group_xy_id_i ( group_xy_id_t'{x: g % NumGroupsX, + y: g / NumGroupsX, + port_id: 1'b0} ), + .noc_req_o ( noc_req_out [g] ), + .noc_req_valid_o ( noc_req_out_valid[g] ), + .noc_req_ready_i ( noc_req_out_ready[g] ), + .noc_req_i ( noc_req_in [g] ), + .noc_req_valid_i ( noc_req_in_valid [g] ), + .noc_req_ready_o ( noc_req_in_ready [g] ), + .noc_rsp_o ( noc_rsp_out [g] ), + .noc_rsp_valid_o ( noc_rsp_out_valid[g] ), + .noc_rsp_ready_i ( noc_rsp_out_ready[g] ), + .noc_rsp_i ( noc_rsp_in [g] ), + .noc_rsp_valid_i ( noc_rsp_in_valid [g] ), + .noc_rsp_ready_o ( noc_rsp_in_ready [g] ) ); end + // ---------------------------- + // Inter-group NoC mesh wiring + // ---------------------------- + + // East-West (horizontal) interior connections + for (genvar gx = 0; gx < NumGroupsX-1; gx++) begin : gen_ew_conn + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_ew_conn_y + // East output of (gx,gy) → West input of (gx+1,gy) + assign noc_req_in [gx+1 + gy*NumGroupsX][3] = noc_req_out [gx + gy*NumGroupsX][1]; + assign noc_req_in_valid[gx+1 + gy*NumGroupsX][3] = noc_req_out_valid[gx + gy*NumGroupsX][1]; + assign noc_req_out_ready[gx + gy*NumGroupsX][1] = noc_req_in_ready [gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_in [gx+1 + gy*NumGroupsX][3] = noc_rsp_out [gx + gy*NumGroupsX][1]; + assign noc_rsp_in_valid[gx+1 + gy*NumGroupsX][3] = noc_rsp_out_valid[gx + gy*NumGroupsX][1]; + assign noc_rsp_out_ready[gx + gy*NumGroupsX][1] = noc_rsp_in_ready [gx+1 + gy*NumGroupsX][3]; + // West output of (gx+1,gy) → East input of (gx,gy) + assign noc_req_in [gx + gy*NumGroupsX][1] = noc_req_out [gx+1 + gy*NumGroupsX][3]; + assign noc_req_in_valid[gx + gy*NumGroupsX][1] = noc_req_out_valid[gx+1 + gy*NumGroupsX][3]; + assign noc_req_out_ready[gx+1 + gy*NumGroupsX][3] = noc_req_in_ready[gx + gy*NumGroupsX][1]; + assign noc_rsp_in [gx + gy*NumGroupsX][1] = noc_rsp_out [gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_in_valid[gx + gy*NumGroupsX][1] = noc_rsp_out_valid[gx+1 + 
gy*NumGroupsX][3]; + assign noc_rsp_out_ready[gx+1 + gy*NumGroupsX][3] = noc_rsp_in_ready[gx + gy*NumGroupsX][1]; + end + end + + // North-South (vertical) interior connections + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_ns_conn + for (genvar gy = 0; gy < NumGroupsY-1; gy++) begin : gen_ns_conn_y + // North output of (gx,gy) (dir 0) → South input of (gx,gy+1) (dir 2) + assign noc_req_in [gx + (gy+1)*NumGroupsX][2] = noc_req_out [gx + gy*NumGroupsX][0]; + assign noc_req_in_valid[gx + (gy+1)*NumGroupsX][2] = noc_req_out_valid[gx + gy*NumGroupsX][0]; + assign noc_req_out_ready[gx + gy *NumGroupsX][0] = noc_req_in_ready[gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_in [gx + (gy+1)*NumGroupsX][2] = noc_rsp_out [gx + gy*NumGroupsX][0]; + assign noc_rsp_in_valid[gx + (gy+1)*NumGroupsX][2] = noc_rsp_out_valid[gx + gy*NumGroupsX][0]; + assign noc_rsp_out_ready[gx + gy *NumGroupsX][0] = noc_rsp_in_ready[gx + (gy+1)*NumGroupsX][2]; + // South output of (gx,gy+1) (dir 2) → North input of (gx,gy) (dir 0) + assign noc_req_in [gx + gy *NumGroupsX][0] = noc_req_out [gx + (gy+1)*NumGroupsX][2]; + assign noc_req_in_valid[gx + gy *NumGroupsX][0] = noc_req_out_valid[gx + (gy+1)*NumGroupsX][2]; + assign noc_req_out_ready[gx + (gy+1)*NumGroupsX][2] = noc_req_in_ready[gx + gy *NumGroupsX][0]; + assign noc_rsp_in [gx + gy *NumGroupsX][0] = noc_rsp_out [gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_in_valid[gx + gy *NumGroupsX][0] = noc_rsp_out_valid[gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_out_ready[gx + (gy+1)*NumGroupsX][2] = noc_rsp_in_ready[gx + gy *NumGroupsX][0]; + end + end + + // West boundary: gx=0 has no West neighbor (dir 3) + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_west_bnd + assign noc_req_in [gy*NumGroupsX][3] = '0; + assign noc_req_in_valid[gy*NumGroupsX][3] = '0; + assign noc_req_out_ready[gy*NumGroupsX][3] = '1; + assign noc_rsp_in [gy*NumGroupsX][3] = '0; + assign noc_rsp_in_valid[gy*NumGroupsX][3] = '0; + assign 
noc_rsp_out_ready[gy*NumGroupsX][3] = '1; + end + + // East boundary: gx=NumGroupsX-1 has no East neighbor (dir 1) + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_east_bnd + assign noc_req_in [(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_req_in_valid[(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_req_out_ready[(NumGroupsX-1) + gy*NumGroupsX][1] = '1; + assign noc_rsp_in [(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_rsp_in_valid[(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_rsp_out_ready[(NumGroupsX-1) + gy*NumGroupsX][1] = '1; + end + + // South boundary: gy=0 has no South neighbor (dir 2) + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_south_bnd + assign noc_req_in [gx][2] = '0; + assign noc_req_in_valid[gx][2] = '0; + assign noc_req_out_ready[gx][2] = '1; + assign noc_rsp_in [gx][2] = '0; + assign noc_rsp_in_valid[gx][2] = '0; + assign noc_rsp_out_ready[gx][2] = '1; + end + + // North boundary: gy=NumGroupsY-1 has no North neighbor (dir 0) + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_north_bnd + assign noc_req_in [gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_req_in_valid[gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_req_out_ready[gx + (NumGroupsY-1)*NumGroupsX][0] = '1; + assign noc_rsp_in [gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_rsp_in_valid[gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_rsp_out_ready[gx + (NumGroupsY-1)*NumGroupsX][0] = '1; + end + // ------------- // To Main Memory: reqrsp_to_axi per group, then axi_mux across groups // ------------- diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index 5bf72af..eb89c99 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -136,6 +136,8 @@ module cachepool_group /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. 
input logic [9:0] hart_base_id_i, + /// Globally-unique tile ID of the first tile in this group (= group_index * NumTilesPerGroup). + input logic [TileIDWidth-1:0] tile_base_id_i, /// Base address of cluster. TCDM and cluster peripheral location are derived from /// it. This signal is pseudo-static. input axi_addr_t cluster_base_addr_i, @@ -190,7 +192,7 @@ module cachepool_group // --------- /// Minimum width to hold the core number. localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); - localparam int unsigned TileIDWidth = cf_math_pkg::idx_width(NumTiles); + // localparam int unsigned TileIDWidth = cf_math_pkg::idx_width(NumTiles); // Per-group overrides of package-level constants that depend on NumTiles/NumCores. localparam int unsigned NrCoresTileLocal = NrCores / NumTilesPerGroup; @@ -672,13 +674,10 @@ module cachepool_group assign tile_remote_in_rsp_valid[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p_valid; assign tile_remote_in_req_ready[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].q_ready; - // Request selection: convert narrow tile_id to wide xbar index by appending - // core_id % NumRemotePortCore (available in the request channel user field) assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] * NumRemotePortCore + tile_remote_out_req_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); - // Response selection: recover xbar port from tile_id and core_id in response user field assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.tile_id * NumRemotePortCore + tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); @@ -691,7 +690,7 @@ module cachepool_group assign hart_base_id = hart_base_id_i + t * NumCoresTile; logic [TileIDWidth-1:0] tile_id; - assign tile_id = t; + assign tile_id = 
tile_base_id_i + TileIDWidth'(t); if (NumRemoteGroupPortCore == 0) begin : gen_tile cachepool_tile #( diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv index d64e3c0..b62ca36 100644 --- a/hardware/src/cachepool_group_noc_wrapper.sv +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -3,12 +3,8 @@ // SPDX-License-Identifier: SHL-0.51 // Description: Wrapper around cachepool_group that handles inter-group -// interconnection (mux/demux, flit packing, routers, receiving xbar). -// -// For now this is a pass-through wrapper with inter-group ports tied off, -// allowing the cluster to instantiate it in place of cachepool_group -// without functional change. The inter-group logic will be added -// incrementally. +// interconnection: master-side concentration xbar, flit packing, floo_router +// instances (req + rsp), and a slave-side dispatch xbar. // // Author: Diyou Shen @@ -28,111 +24,68 @@ module cachepool_group_noc_wrapper import cachepool_pkg::*; + import floo_pkg::*; import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; import snitch_icache_pkg::icache_events_t; #( - /// Width of physical address. parameter int unsigned AxiAddrWidth = 48, - /// Width of AXI port. parameter int unsigned AxiDataWidth = 512, - /// AXI: id width in. parameter int unsigned AxiIdWidthIn = 2, - /// AXI: id width out. parameter int unsigned AxiIdWidthOut = 2, - /// AXI: user width. parameter int unsigned AxiUserWidth = 1, - /// Address from which to fetch the first instructions. parameter logic [31:0] BootAddr = 32'h0, - /// Address to indicate start of UART parameter logic [31:0] UartAddr = 32'h0, - /// The total amount of cores. parameter int unsigned NrCores = 0, - /// Data/TCDM memory depth per cut (in words). parameter int unsigned TCDMDepth = 1024, - /// Cluster peripheral address region size (in kB). parameter int unsigned ClusterPeriphSize = 64, - /// Number of TCDM Banks. 
parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request FIFO. parameter int unsigned DMAReqFifoDepth = 3, - /// Width of a single icache line. parameter int unsigned ICacheLineWidth = 0, - /// Number of icache lines per set. parameter int unsigned ICacheLineCount = 0, - /// Number of icache sets. parameter int unsigned ICacheSets = 0, - /// Per-core enabling of the custom `Xdma` ISA extensions. parameter bit [NrCores-1:0] Xdma = '{default: '0}, - /// FPU configuration. parameter fpu_implementation_t FPUImplementation = fpu_implementation_t'(0), - /// Number of Spatz FPUs parameter int unsigned NumSpatzFPUs = 1, - /// Number of Spatz IPUs parameter int unsigned NumSpatzIPUs = 1, - /// Physical Memory Attributes Configuration parameter snitch_pma_t SnitchPMACfg = '0, - /// # Outstanding loads parameter int unsigned NumIntOutstandingLoads = 1, parameter int unsigned NumIntOutstandingMem = 4, parameter int unsigned NumSpatzOutstandingLoads = 4, - /// Insert Pipeline registers into off-loading path (roles) parameter bit RegisterOffloadRsp = 1, - /// Insert Pipeline registers into data cache request path parameter bit RegisterCoreReq = 0, - /// Insert Pipeline registers into data cache response path parameter bit RegisterCoreRsp = 0, - /// Insert Pipeline registers after each memory cut parameter bit RegisterTCDMCuts = 1'b0, - /// Decouple external AXI plug parameter bit RegisterExt = 1'b0, parameter axi_pkg::xbar_latency_e XbarLatency = axi_pkg::CUT_ALL_PORTS, - /// Outstanding transactions on the AXI network parameter int unsigned MaxMstTrans = 4, parameter int unsigned MaxSlvTrans = 4, - /// # Interface - /// AXI Ports parameter type axi_in_req_t = logic, parameter type axi_in_resp_t = logic, parameter type axi_narrow_req_t = logic, parameter type axi_narrow_resp_t = logic, parameter type axi_out_req_t = logic, parameter type axi_out_resp_t = logic, - /// SRAM configuration 
parameter type impl_in_t = logic, - // Memory latency parameter. parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, - /// # SRAM Configuration rules needed parameter int unsigned NrSramCfg = 1 ) ( - /// System clock. input logic clk_i, - /// Asynchronous active high reset. input logic rst_ni, - /// Per-core debug request signal. input logic [NrCores-1:0] debug_req_i, - /// Machine external interrupt pending. input logic [NrCores-1:0] meip_i, - /// Machine timer interrupt pending. input logic [NrCores-1:0] mtip_i, - /// Core software interrupt pending. input logic [NrCores-1:0] msip_i, - /// First hartid of the cluster. input logic [9:0] hart_base_id_i, - /// Base address of cluster. + input logic [TileIDWidth-1:0] tile_base_id_i, input axi_addr_t cluster_base_addr_i, - /// Partitioning address input axi_addr_t private_start_addr_i, - /// AXI Narrow out-port (UART/Peripheral) output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, - /// DRAM refill reqrsp ports (post-xbar, one per L2 channel) output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, - /// Peripheral signals output icache_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, input logic [NrCores-1:0] cl_interrupt_i, @@ -140,35 +93,490 @@ module cachepool_group_noc_wrapper input logic [3:0] l1d_private_i, input cache_insn_t l1d_insn_i, input logic l1d_insn_valid_i, - output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, - input logic [NumTilesPerGroup-1:0] l1d_busy_i, - /// SRAM Configuration + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, input impl_in_t [NrSramCfg-1:0] impl_i, - /// Indicate the program execution is error - output logic error_o + output logic error_o, + // XY coordinates of this group in the inter-group mesh + input 
group_xy_id_t group_xy_id_i, + // Inter-group req mesh: 4 directions (N=0,E=1,S=2,W=3) + // dim1: direction, dim2: tile*NumNoCPortsPerTile+channel + output noc_group_req_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_o, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_valid_o, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_ready_i, + input noc_group_req_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_i, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_valid_i, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_ready_o, + // Inter-group rsp mesh + output noc_group_rsp_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_o, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_valid_o, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_ready_i, + input noc_group_rsp_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_i, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_valid_i, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_ready_o ); // ------------------------------------------------------------------------- - // Inter-group remote signals + // Localparams // ------------------------------------------------------------------------- - // Total per-group inter-group port count. localparam int unsigned NumRemoteGroupPortTile = (NumRemoteGroupPortCore == 0) ? 1 : NumRemoteGroupPortCore * NrTCDMPortsPerCore; localparam int unsigned NumRemoteGroupPortGroup = NumRemoteGroupPortTile * NumTilesPerGroup; + localparam int unsigned NumNoCPortsGroup = NumNoCPortsPerTile * NumTilesPerGroup; + localparam int unsigned SlvXbarSelW = (NumRemoteGroupPortGroup > 1) ? $clog2(NumRemoteGroupPortGroup) : 1; + localparam int unsigned MstXbarSelW = (NumNoCPortsGroup > 1) ? 
$clog2(NumNoCPortsGroup) : 1; + + // -- Struct / xbar field widths (always >= 1 to avoid zero-width ports) ------ + localparam int unsigned NocCacheBankBits = $clog2(NrBanks); + localparam int unsigned NocAddrTileWidth = (NumTilesPerGroup > 1) ? $clog2(NumTilesPerGroup) : 1; + localparam int unsigned NocAddrXWidth = (NumGroupsX > 1) ? $clog2(NumGroupsX) : 1; + localparam int unsigned NocAddrYWidth = (NumGroupsY > 1) ? $clog2(NumGroupsY) : 1; + // -- Actual bit counts inside dst_tile_id (can be 0 when that dimension = 1) - + // dst_tile_id layout: [ group_y (NocGroupBitsY) | group_x (NocGroupBitsX) | local_tile (NocGroupOffset) ] + // where NocGroupOffset = $clog2(NumTilesPerGroup) (0 when NumTilesPerGroup == 1). + localparam int unsigned NocGroupOffset = $clog2(NumTilesPerGroup); + localparam int unsigned NocGroupBitsX = (NumGroupsX > 1) ? $clog2(NumGroupsX) : 0; + localparam int unsigned NocGroupBitsY = (NumGroupsY > 1) ? $clog2(NumGroupsY) : 0; + + // ------------------------------------------------------------------------- + // Group ↔ wrapper boundary signals + // ------------------------------------------------------------------------- remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_to_group; remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_from_group; remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_from_group; remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_to_group; - // Tie off incoming inter-group requests: no traffic from other groups (for now). 
- assign remote_group_req_to_group = '0; - assign remote_group_rsp_to_group = '0; + // ------------------------------------------------------------------------- + // Mesh signals [tile][ch][dir=3:0] and transposition to/from ports + // ------------------------------------------------------------------------- + noc_group_req_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out_ready; + noc_group_req_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in_ready; + + noc_group_rsp_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out_ready; + noc_group_rsp_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in_ready; + + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mesh_trans_t + for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_mesh_trans_c + for (genvar d = 0; d < 4; d++) begin : gen_mesh_trans_d + assign noc_req_o[d][t*NumNoCPortsPerTile+c] = req_mesh_out[t][c][d]; + assign noc_req_valid_o[d][t*NumNoCPortsPerTile+c] = req_mesh_out_valid[t][c][d]; + assign req_mesh_out_ready[t][c][d] = noc_req_ready_i[d][t*NumNoCPortsPerTile+c]; + assign req_mesh_in[t][c][d] = noc_req_i[d][t*NumNoCPortsPerTile+c]; + assign req_mesh_in_valid[t][c][d] = noc_req_valid_i[d][t*NumNoCPortsPerTile+c]; + assign noc_req_ready_o[d][t*NumNoCPortsPerTile+c] = req_mesh_in_ready[t][c][d]; + + assign 
noc_rsp_o[d][t*NumNoCPortsPerTile+c] = rsp_mesh_out[t][c][d]; + assign noc_rsp_valid_o[d][t*NumNoCPortsPerTile+c] = rsp_mesh_out_valid[t][c][d]; + assign rsp_mesh_out_ready[t][c][d] = noc_rsp_ready_i[d][t*NumNoCPortsPerTile+c]; + assign rsp_mesh_in[t][c][d] = noc_rsp_i[d][t*NumNoCPortsPerTile+c]; + assign rsp_mesh_in_valid[t][c][d] = noc_rsp_valid_i[d][t*NumNoCPortsPerTile+c]; + assign noc_rsp_ready_o[d][t*NumNoCPortsPerTile+c] = rsp_mesh_in_ready[t][c][d]; + end + end + end + + + if (NumRemoteGroupPortCore > 0) begin : gen_noc + + // ----------------------------------------------------------------------- + // Router inject/eject signals (flat 1D index j = t*NumNoCPortsPerTile+c) + // ----------------------------------------------------------------------- + noc_group_req_t [NumNoCPortsGroup-1:0] packed_req; + logic [NumNoCPortsGroup-1:0] packed_req_valid; + logic [NumNoCPortsGroup-1:0] packed_req_ready; + + noc_group_req_t [NumNoCPortsGroup-1:0] eject_req; + logic [NumNoCPortsGroup-1:0] eject_req_valid; + logic [NumNoCPortsGroup-1:0] eject_req_ready; + + noc_group_rsp_t [NumNoCPortsGroup-1:0] inject_rsp; + logic [NumNoCPortsGroup-1:0] inject_rsp_valid; + logic [NumNoCPortsGroup-1:0] inject_rsp_ready; + + noc_group_rsp_t [NumNoCPortsGroup-1:0] eject_rsp; + logic [NumNoCPortsGroup-1:0] eject_rsp_valid; + logic [NumNoCPortsGroup-1:0] eject_rsp_ready; + + // Master xbar output (one concentrated req/rsp channel per tile/channel) + remote_group_req_chan_t [NumNoCPortsGroup-1:0] mst_xbar_req; + logic [NumNoCPortsGroup-1:0] mst_xbar_req_valid; + logic [NumNoCPortsGroup-1:0] mst_xbar_req_ready; + + // Slave xbar signals + noc_group_req_t [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req_valid; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req_ready; + noc_group_rsp_t [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp_valid; + logic [NumRemoteGroupPortGroup-1:0] 
slv_xbar_mst_rsp_ready; + noc_group_rsp_t [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp; + logic [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp_valid; + logic [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp_ready; + + logic [NumNoCPortsGroup-1:0][SlvXbarSelW-1:0] slv_xbar_slv_sel; + logic [NumRemoteGroupPortGroup-1:0][MstXbarSelW-1:0] slv_xbar_mst_sel; + + + // ----------------------------------------------------------------------- + // Master-side per-tile concentration xbar + flit packing + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mst_t + + remote_group_req_chan_t [NumRemoteGroupPortTile-1:0] mst_slv_req; + logic [NumRemoteGroupPortTile-1:0] mst_slv_req_valid; + logic [NumRemoteGroupPortTile-1:0] mst_slv_req_ready; + remote_group_rsp_chan_t [NumRemoteGroupPortTile-1:0] mst_slv_rsp; + logic [NumRemoteGroupPortTile-1:0] mst_slv_rsp_valid; + logic [NumRemoteGroupPortTile-1:0] mst_slv_rsp_ready; + remote_group_rsp_chan_t [NumNoCPortsPerTile-1:0] eject_rsp_payload; + portid_t [NumNoCPortsPerTile-1:0] mst_xbar_mst_sel; + portid_t [NumNoCPortsPerTile-1:0] mst_xbar_slv_selected; + + for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_mst_port_p + assign mst_slv_req[p] = remote_group_req_from_group[t*NumRemoteGroupPortTile+p].q; + assign mst_slv_req_valid[p] = remote_group_req_from_group[t*NumRemoteGroupPortTile+p].q_valid; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].q_ready = mst_slv_req_ready[p]; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].p = mst_slv_rsp[p]; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].p_valid = mst_slv_rsp_valid[p]; + assign mst_slv_rsp_ready[p] = + remote_group_req_from_group[t*NumRemoteGroupPortTile+p].p_ready; + end + + for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_mst_eject_c + localparam int unsigned J = t * NumNoCPortsPerTile + c; + assign eject_rsp_payload[c] = eject_rsp[J].payload; + assign 
mst_xbar_mst_sel[c] = eject_rsp[J].hdr.src_port_id; + end + + reqrsp_xbar #( + .NumInp ( NumRemoteGroupPortTile ), + .NumOut ( NumNoCPortsPerTile ), + .tcdm_req_chan_t ( remote_group_req_chan_t ), + .tcdm_rsp_chan_t ( remote_group_rsp_chan_t ) + ) i_noc_mst_xbar ( + .clk_i, + .rst_ni, + .slv_req_i ( mst_slv_req ), + .slv_rr_i ( '0 ), + .slv_req_valid_i ( mst_slv_req_valid ), + .slv_req_ready_o ( mst_slv_req_ready ), + .slv_rsp_o ( mst_slv_rsp ), + .slv_rsp_valid_o ( mst_slv_rsp_valid ), + .slv_rsp_ready_i ( mst_slv_rsp_ready ), + .slv_sel_i ( '0 ), + .slv_selected_o ( mst_xbar_slv_selected ), + .mst_req_o ( mst_xbar_req[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_req_valid_o ( mst_xbar_req_valid[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_req_ready_i ( mst_xbar_req_ready[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_rsp_i ( eject_rsp_payload ), + .mst_rr_i ( '0 ), + .mst_rsp_valid_i ( eject_rsp_valid[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_rsp_ready_o ( eject_rsp_ready[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_sel_i ( mst_xbar_mst_sel ) + ); + + for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_pack_c + localparam int unsigned J = t * NumNoCPortsPerTile + c; + assign packed_req[J].hdr.collective_op = '0; + assign packed_req[J].hdr.src_id = group_xy_id_i; + // dst_tile_id set by tcdm_cache_interco: bits [NocGroupOffset +: NocGroupBitsX] = group_x, + // bits [(NocGroupOffset+NocGroupBitsX) +: NocGroupBitsY] = group_y. + // When a dimension has only 1 group, no bits are consumed and the coordinate is 0. 
+ if (NumGroupsX > 1) begin : gen_dst_x + assign packed_req[J].hdr.dst_id.x = + mst_xbar_req[J].user.dst_tile_id[NocGroupOffset +: NocGroupBitsX]; + end else begin : gen_dst_x + assign packed_req[J].hdr.dst_id.x = '0; + end + if (NumGroupsY > 1) begin : gen_dst_y + assign packed_req[J].hdr.dst_id.y = + mst_xbar_req[J].user.dst_tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + end else begin : gen_dst_y + assign packed_req[J].hdr.dst_id.y = '0; + end + assign packed_req[J].hdr.dst_id.port_id = '0; + assign packed_req[J].hdr.src_tile_id = group_tile_sel_t'(t); + assign packed_req[J].hdr.src_port_id = mst_xbar_slv_selected[c]; + assign packed_req[J].hdr.last = 1'b1; + assign packed_req[J].payload = mst_xbar_req[J]; + assign packed_req_valid[J] = mst_xbar_req_valid[J]; + assign mst_xbar_req_ready[J] = packed_req_ready[J]; + +`ifndef TARGET_SYNTHESIS + initial begin + #100; + $display("[NOC_DBG] group(%0d,%0d) port J=%0d: dyn_off=%0d CacheBankBits=%0d TileWidth=%0d XWidth=%0d YWidth=%0d", + group_xy_id_i.x, group_xy_id_i.y, J, + dynamic_offset_i, NocCacheBankBits, NocAddrTileWidth, NocAddrXWidth, NocAddrYWidth); + end + always @(posedge clk_i) begin + if (packed_req_valid[J] && packed_req_ready[J]) begin + $display("[NOC_INJ] t=%0t dyn_off=%0d group(%0d,%0d) J=%0d addr=0x%08x dst(%0d,%0d) src(%0d,%0d)", + $time, dynamic_offset_i, + group_xy_id_i.x, group_xy_id_i.y, J, + mst_xbar_req[J].addr, + packed_req[J].hdr.dst_id.x, packed_req[J].hdr.dst_id.y, + packed_req[J].hdr.src_id.x, packed_req[J].hdr.src_id.y); + end + end +`endif + end + + end : gen_mst_t + + + // ----------------------------------------------------------------------- + // Per-tile per-channel req floo_router + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_req_router_t + for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_req_router_c + localparam int unsigned J = t * NumNoCPortsPerTile + c; + 
floo_router #( + .NumRoutes ( 5 ), + .NumVirtChannels ( 1 ), + .NumPhysChannels ( 1 ), + .InFifoDepth ( 2 ), + .OutFifoDepth ( 0 ), + .RouteAlgo ( XYRouting ), + .IdWidth ( $bits(group_xy_id_t) ), + .id_t ( group_xy_id_t ), + .NumAddrRules ( 1 ), + .addr_rule_t ( logic ), + .flit_t ( noc_group_req_t ), + .hdr_t ( noc_group_hdr_t ) + ) i_req_router ( + .clk_i, + .rst_ni, + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {packed_req_valid[J], + req_mesh_in_valid[t][c][3:0]} ), + .ready_o ( {packed_req_ready[J], + req_mesh_in_ready[t][c][3:0]} ), + .data_i ( {packed_req[J], + req_mesh_in[t][c][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_req_valid[J], + req_mesh_out_valid[t][c][3:0]} ), + .ready_i ( {eject_req_ready[J], + req_mesh_out_ready[t][c][3:0]} ), + .data_o ( {eject_req[J], + req_mesh_out[t][c][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) + ); + end + end + + + // ----------------------------------------------------------------------- + // Per-tile per-channel rsp floo_router + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_rsp_router_t + for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_rsp_router_c + localparam int unsigned J = t * NumNoCPortsPerTile + c; + floo_router #( + .NumRoutes ( 5 ), + .NumVirtChannels ( 1 ), + .NumPhysChannels ( 1 ), + .InFifoDepth ( 2 ), + .OutFifoDepth ( 0 ), + .RouteAlgo ( XYRouting ), + .IdWidth ( $bits(group_xy_id_t) ), + .id_t ( group_xy_id_t ), + .NumAddrRules ( 1 ), + .addr_rule_t ( logic ), + .flit_t ( noc_group_rsp_t ), + .hdr_t ( noc_group_hdr_t ) + ) i_rsp_router ( + .clk_i, + .rst_ni, + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {inject_rsp_valid[J], + rsp_mesh_in_valid[t][c][3:0]} ), + .ready_o ( {inject_rsp_ready[J], + rsp_mesh_in_ready[t][c][3:0]} ), + .data_i ( {inject_rsp[J], + 
rsp_mesh_in[t][c][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_rsp_valid[J], + rsp_mesh_out_valid[t][c][3:0]} ), + .ready_i ( {eject_rsp_ready[J], + rsp_mesh_out_ready[t][c][3:0]} ), + .data_o ( {eject_rsp[J], + rsp_mesh_out[t][c][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) + ); + end + end + + + // ----------------------------------------------------------------------- + // Slave xbar selection signals + inject_rsp ↔ slv_xbar_slv_rsp + // ----------------------------------------------------------------------- + for (genvar j = 0; j < NumNoCPortsGroup; j++) begin : gen_slv_sel_j + assign slv_xbar_slv_sel[j] = (NumTilesPerGroup == 1) + ? SlvXbarSelW'(eject_req[j].hdr.src_port_id) + : SlvXbarSelW'(eject_req[j].payload.addr[(dynamic_offset_i + NocCacheBankBits) +: NocAddrTileWidth] + * NumRemoteGroupPortTile + + eject_req[j].hdr.src_port_id); + +`ifndef TARGET_SYNTHESIS + always @(posedge clk_i) begin + if (eject_req_valid[j] && eject_req_ready[j]) begin + $display("[SLV_EJECT_OK] t=%0t group(%0d,%0d) j=%0d addr=0x%08x tile_bits=%0d sel=%0d", + $time, group_xy_id_i.x, group_xy_id_i.y, j, + eject_req[j].payload.addr, + eject_req[j].payload.addr[(dynamic_offset_i + NocCacheBankBits) +: NocAddrTileWidth], + slv_xbar_slv_sel[j]); + end + if (eject_req_valid[j] && !eject_req_ready[j]) begin + $display("[SLV_EJECT_STALL] t=%0t group(%0d,%0d) j=%0d addr=0x%08x sel=%0d", + $time, group_xy_id_i.x, group_xy_id_i.y, j, + eject_req[j].payload.addr, slv_xbar_slv_sel[j]); + end + end +`endif + end + + assign inject_rsp = slv_xbar_slv_rsp; + assign inject_rsp_valid = slv_xbar_slv_rsp_valid; + assign slv_xbar_slv_rsp_ready = inject_rsp_ready; + + + // ----------------------------------------------------------------------- + // Slave-side group-wide dispatch xbar + // ----------------------------------------------------------------------- + reqrsp_xbar #( + .NumInp ( NumNoCPortsGroup ), + .NumOut ( NumRemoteGroupPortGroup ), + .tcdm_req_chan_t ( 
noc_group_req_t ), + .tcdm_rsp_chan_t ( noc_group_rsp_t ) + ) i_noc_slv_xbar ( + .clk_i, + .rst_ni, + .slv_req_i ( eject_req ), + .slv_rr_i ( '0 ), + .slv_req_valid_i ( eject_req_valid ), + .slv_req_ready_o ( eject_req_ready ), + .slv_rsp_o ( slv_xbar_slv_rsp ), + .slv_rsp_valid_o ( slv_xbar_slv_rsp_valid ), + .slv_rsp_ready_i ( slv_xbar_slv_rsp_ready ), + .slv_sel_i ( slv_xbar_slv_sel ), + .slv_selected_o ( ), + .mst_req_o ( slv_xbar_mst_req ), + .mst_req_valid_o ( slv_xbar_mst_req_valid ), + .mst_req_ready_i ( slv_xbar_mst_req_ready ), + .mst_rsp_i ( slv_xbar_mst_rsp ), + .mst_rr_i ( '0 ), + .mst_rsp_valid_i ( slv_xbar_mst_rsp_valid ), + .mst_rsp_ready_o ( slv_xbar_mst_rsp_ready ), + .mst_sel_i ( slv_xbar_mst_sel ) + ); + + + // ----------------------------------------------------------------------- + // Slave delivery: unpack xbar output → group slave ports + rsp packing + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_slv_deliver_t + for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_slv_deliver_p + localparam int unsigned J = t * NumRemoteGroupPortTile + p; + localparam int unsigned SLV = t * NumRemoteGroupPortTile + p; + + // Placeholder response routing: route response back via the NoC channel + // of the same tile (t). Correct cross-tile response routing is deferred. 
+ assign slv_xbar_mst_sel[J] = MstXbarSelW'(t * NumNoCPortsPerTile); + + always_comb begin : proc_req_unpack + remote_group_req_to_group[SLV].q = slv_xbar_mst_req[J].payload; + remote_group_req_to_group[SLV].q.user.src_group_x = + slv_xbar_mst_req[J].hdr.src_id.x; + remote_group_req_to_group[SLV].q.user.src_group_y = + slv_xbar_mst_req[J].hdr.src_id.y; + end + + assign remote_group_req_to_group[SLV].q_valid = slv_xbar_mst_req_valid[J]; + assign slv_xbar_mst_req_ready[J] = + remote_group_rsp_from_group[SLV].q_ready; + assign remote_group_req_to_group[SLV].p_ready = slv_xbar_mst_rsp_ready[J]; + +`ifndef TARGET_SYNTHESIS + always @(posedge clk_i) begin + if (slv_xbar_mst_req_valid[J] && slv_xbar_mst_req_ready[J]) begin + $display("[SLV_DELIVER_OK] t=%0t group(%0d,%0d) J=%0d SLV=%0d addr=0x%08x", + $time, group_xy_id_i.x, group_xy_id_i.y, J, SLV, + slv_xbar_mst_req[J].payload.addr); + end + if (slv_xbar_mst_req_valid[J] && !slv_xbar_mst_req_ready[J]) begin + $display("[SLV_DELIVER_STALL] t=%0t group(%0d,%0d) J=%0d SLV=%0d l1d_busy=%0b q_ready=%0b", + $time, group_xy_id_i.x, group_xy_id_i.y, J, SLV, + l1d_busy_i[t], remote_group_rsp_from_group[SLV].q_ready); + end + end +`endif + + assign slv_xbar_mst_rsp[J].payload = + remote_group_rsp_from_group[SLV].p; + assign slv_xbar_mst_rsp[J].hdr.collective_op = '0; + assign slv_xbar_mst_rsp[J].hdr.src_id = group_xy_id_i; + if (NumGroupsX > 1) begin : gen_rsp_dst_x + assign slv_xbar_mst_rsp[J].hdr.dst_id.x = + remote_group_rsp_from_group[SLV].p.user.tile_id[NocGroupOffset +: NocGroupBitsX]; + end else begin : gen_rsp_dst_x + assign slv_xbar_mst_rsp[J].hdr.dst_id.x = '0; + end + if (NumGroupsY > 1) begin : gen_rsp_dst_y + assign slv_xbar_mst_rsp[J].hdr.dst_id.y = + remote_group_rsp_from_group[SLV].p.user.tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + end else begin : gen_rsp_dst_y + assign slv_xbar_mst_rsp[J].hdr.dst_id.y = '0; + end + assign slv_xbar_mst_rsp[J].hdr.dst_id.port_id = '0; + assign 
slv_xbar_mst_rsp[J].hdr.src_tile_id = group_tile_sel_t'(t); + assign slv_xbar_mst_rsp[J].hdr.src_port_id = remote_group_rsp_from_group[SLV].p.user.port_id; + assign slv_xbar_mst_rsp[J].hdr.last = 1'b1; + assign slv_xbar_mst_rsp_valid[J] = + remote_group_rsp_from_group[SLV].p_valid; + end + end + + + end else begin : gen_noc_disabled + + assign remote_group_req_to_group = '0; + assign remote_group_rsp_to_group = '0; + assign req_mesh_out = '0; + assign req_mesh_out_valid = '0; + assign req_mesh_in_ready = '0; + assign rsp_mesh_out = '0; + assign rsp_mesh_out_valid = '0; + assign rsp_mesh_in_ready = '0; + + end - // ------------------------------------------------------------------------- // Group instantiation // ------------------------------------------------------------------------- @@ -212,8 +620,8 @@ module cachepool_group_noc_wrapper .MaxMstTrans ( MaxMstTrans ), .MaxSlvTrans ( MaxSlvTrans ) ) i_group ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), + .clk_i, + .rst_ni, .impl_i ( impl_i ), .error_o ( error_o ), .debug_req_i ( debug_req_i ), @@ -221,19 +629,17 @@ module cachepool_group_noc_wrapper .mtip_i ( mtip_i ), .msip_i ( msip_i ), .hart_base_id_i ( hart_base_id_i ), + .tile_base_id_i ( tile_base_id_i ), .cluster_base_addr_i ( cluster_base_addr_i ), .private_start_addr_i ( private_start_addr_i ), .axi_narrow_req_o ( axi_narrow_req_o ), .axi_narrow_rsp_i ( axi_narrow_rsp_i ), - // DRAM refill reqrsp (post-xbar, one per L2 channel) .l2_req_o ( l2_req_o ), .l2_rsp_i ( l2_rsp_i ), - // Inter-group remote ports (tied off for now) .remote_group_req_o ( remote_group_req_from_group ), .remote_group_rsp_i ( remote_group_rsp_to_group ), .remote_group_req_i ( remote_group_req_to_group ), .remote_group_rsp_o ( remote_group_rsp_from_group ), - // Peripherals .icache_events_o ( icache_events_o ), .icache_prefetch_enable_i ( icache_prefetch_enable_i ), .cl_interrupt_i ( cl_interrupt_i ), diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index 
07bdf0f..d26d7d5 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -78,6 +78,13 @@ package cachepool_pkg; // How many remote group ports for each tile? localparam int unsigned NumRemoteGroupPortCore = `ifdef RG_PORT_PER_CORE `RG_PORT_PER_CORE `else 0 `endif; + // Number of inter-group NoC router channels per tile (x in the 5-to-x concentration xbar). + localparam int unsigned NumNoCPortsPerTile = `ifdef NOC_PORT_PER_TILE `NOC_PORT_PER_TILE `else 1 `endif; + + // Group mesh dimensions. NumGroupsY is derived; NumGroupsX must be set via config. + localparam int unsigned NumGroupsX = `ifdef NUM_GROUPS_X `NUM_GROUPS_X `else 1 `endif; + localparam int unsigned NumGroupsY = NumGroups / NumGroupsX; + //////////////////// // CLUSTER HW // @@ -394,20 +401,52 @@ package cachepool_pkg; typedef logic [$clog2(NrTCDMPortsPerCore)-1:0] portid_t; typedef struct packed { - // sender core within tile - logic [CoreIDWidth-1:0] core_id; - // sender tile (globally unique) - logic [TileIDWidth-1:0] tile_id; - // outstanding request ID - reqid_t req_id; - // FPU path indicator - logic is_fpu; - // interco instance index (for demux) - portid_t port_id; + logic [CoreIDWidth-1:0] core_id; + logic [TileIDWidth-1:0] tile_id; + reqid_t req_id; + logic is_fpu; + portid_t port_id; + logic [idx_width(NumGroupsX)-1:0] src_group_x; + logic [idx_width(NumGroupsY)-1:0] src_group_y; + // Globally-unique destination tile ID, set by tcdm_cache_interco for + // inter-group requests. Upper bits (above $clog2(NumTilesPerGroup)) are + // the linear group index; lower bits are the local tile within the group. + logic [TileIDWidth-1:0] dst_tile_id; } remote_group_user_t; `REQRSP_TYPEDEF_ALL(remote_group, narrow_addr_t, narrow_data_t, narrow_strb_t, remote_group_user_t) + // XY mesh coordinates for a group. port_id selects the eject port (always 0 for single-link). 
+ typedef struct packed { + logic [idx_width(NumGroupsX)-1:0] x; + logic [idx_width(NumGroupsY)-1:0] y; + logic port_id; + } group_xy_id_t; + + // Per-group tile index used by dispatch xbar selection. + typedef logic [idx_width(NumTilesPerGroup)-1:0] group_tile_sel_t; + + // Routing header embedded in every inter-group NoC flit. + typedef struct packed { + logic [3:0] collective_op; + group_xy_id_t src_id; + group_xy_id_t dst_id; + group_tile_sel_t src_tile_id; + portid_t src_port_id; + logic last; + } noc_group_hdr_t; + + // Inter-group NoC flit types (payload + routing header). + typedef struct packed { + remote_group_req_chan_t payload; + noc_group_hdr_t hdr; + } noc_group_req_t; + + typedef struct packed { + remote_group_rsp_chan_t payload; + noc_group_hdr_t hdr; + } noc_group_rsp_t; + ///////////////////// // CLUSTER TYPES // diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 9369942..c1b5748 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -692,12 +692,13 @@ module cachepool_tile strb: rg_interco_out_req[flat].q.strb, amo: rg_interco_out_req[flat].q.amo, user: '{ - core_id: rg_interco_out_req[flat].q.user.core_id, - tile_id: rg_interco_out_req[flat].q.user.tile_id, - req_id: rg_interco_out_req[flat].q.user.req_id, - is_fpu: rg_interco_out_req[flat].q.user.is_fpu, - port_id: portid_t'(j), - default: '0 + core_id: rg_interco_out_req[flat].q.user.core_id, + tile_id: rg_interco_out_req[flat].q.user.tile_id, + req_id: rg_interco_out_req[flat].q.user.req_id, + is_fpu: rg_interco_out_req[flat].q.user.is_fpu, + port_id: portid_t'(j), + dst_tile_id: rg_interco_out_dst[flat], + default: '0 }, default: '0 }, diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index f57397e..b989b08 100644 --- a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -536,4 +536,21 @@ module tcdm_cache_interco #( assign mem_rsp_ready_o = mem_rsp_ready; +`ifndef 
TARGET_SYNTHESIS + // DEBUG: print inter-group request handshakes with decoded group ID + for (genvar rg = 0; rg < NumRemoteGroupPort; rg++) begin : gen_dbg_rg + localparam int unsigned P = NumCache + NumRemotePort + rg; + always @(posedge clk_i) begin + if (mem_req_valid[P] && mem_req_ready[P]) begin + $display("[INTERCO_RG] t=%0t tile=%0d rg=%0d dyn_off=%0d addr=0x%08x tile_id_field=%0d group_id=%0d CacheBankBits=%0d LocalTileBits=%0d GroupBits=%0d", + $time, tile_id_i, rg, dynamic_offset_i, + mem_req[P].addr, + mem_req[P].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth], + mem_req[P].addr[(dynamic_offset_i + CacheBankBits + LocalTileBits) +: (TileIDWidth - LocalTileBits)], + CacheBankBits, LocalTileBits, TileIDWidth - LocalTileBits); + end + end + end +`endif + endmodule From f0f3af483e5496a1cb46a018d5ad5a84e2029665 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 14 May 2026 09:50:51 +0200 Subject: [PATCH 10/37] [MISC] Update wave scripts and bootrom --- hardware/bootrom/bootdata.cc | 4 ++-- hardware/bootrom/bootdata_bootrom.cc | 4 ++-- hardware/bootrom/bootrom.bin | Bin 136 -> 136 bytes hardware/bootrom/bootrom.dump | 4 ++-- hardware/bootrom/bootrom.elf | Bin 5248 -> 5248 bytes hardware/bootrom/bootrom.sv | 4 ++-- sim/scripts/vsim_tile.tcl | 32 +++++++++++++-------------- sim/scripts/vsim_wave.tcl | 13 ++++++----- 8 files changed, 32 insertions(+), 29 deletions(-) diff --git a/hardware/bootrom/bootdata.cc b/hardware/bootrom/bootdata.cc index 9703ee8..7955d62 100644 --- a/hardware/bootrom/bootdata.cc +++ b/hardware/bootrom/bootdata.cc @@ -7,13 +7,13 @@ namespace sim { const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 64, + .core_count = 32, .hartid_base = 0, .tcdm_start = 0xbffff800, .tcdm_size = 0x800, .tcdm_offset = 0x0, .global_mem_start = 0x80000000, .global_mem_end = 0xa0000000, - .tile_count = 16}; + .tile_count = 8}; } // namespace sim diff --git a/hardware/bootrom/bootdata_bootrom.cc b/hardware/bootrom/bootdata_bootrom.cc index 
2c18278..950bc0a 100644 --- a/hardware/bootrom/bootdata_bootrom.cc +++ b/hardware/bootrom/bootdata_bootrom.cc @@ -18,11 +18,11 @@ struct BootData { }; extern "C" const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 64, + .core_count = 32, .hartid_base = 0, .tcdm_start = 0xbffff800, .tcdm_size = 0x800, .tcdm_offset = 0x0, .global_mem_start = 0x80000000, .global_mem_end = 0xa0000000, - .tile_count = 16}; + .tile_count = 8}; diff --git a/hardware/bootrom/bootrom.bin b/hardware/bootrom/bootrom.bin index 01c26acb7246982415de0a8ab2aad76d2fb498d8..24326b49d008c2015b19d9529061dd4d0593660c 100755 GIT binary patch delta 16 XcmeBR>|mVW!l*FORhf}vVx~L*B=`g) delta 16 XcmeBR>|mVW!ssy3Rhdy>Vx~L*CO8C3 diff --git a/hardware/bootrom/bootrom.dump b/hardware/bootrom/bootrom.dump index 3000779..6ed2b86 100644 --- a/hardware/bootrom/bootrom.dump +++ b/hardware/bootrom/bootrom.dump @@ -29,7 +29,7 @@ Disassembly of section .rodata: 00001040 : 1040: 1000 .2byte 0x1000 1042: 0000 .2byte 0x0 - 1044: 0040 .2byte 0x40 + 1044: 0020 .2byte 0x20 1046: 0000 .2byte 0x0 1048: 0000 .2byte 0x0 104a: 0000 .2byte 0x0 @@ -44,7 +44,7 @@ Disassembly of section .rodata: 1062: a000 .2byte 0xa000 1064: 0000 .2byte 0x0 1066: 0000 .2byte 0x0 - 1068: 0010 .2byte 0x10 + 1068: 0008 .2byte 0x8 106a: 0000 .2byte 0x0 106c: 0000 .2byte 0x0 ... 
diff --git a/hardware/bootrom/bootrom.elf b/hardware/bootrom/bootrom.elf index dce1406ce0f47c856466c15eaaf183326b74fa7b..cfb4fa808e615843c403329f452281211e6a6e60 100755 GIT binary patch delta 32 ncmZqBY|z}`BEYCH*;PQ9kz;eFz#>Mrj7s0g{NT-kLitPpl*$P4 delta 32 ncmZqBY|z}`BEaY{*;PQ9QDAeXz#>LAkBVdqAK%S_LitPpm6Zs@ diff --git a/hardware/bootrom/bootrom.sv b/hardware/bootrom/bootrom.sv index 8bed1aa..9ae34b1 100644 --- a/hardware/bootrom/bootrom.sv +++ b/hardware/bootrom/bootrom.sv @@ -21,9 +21,9 @@ module bootrom #( const logic [RomSize-1:0][DataWidth-1:0] mem = { 128'h00001040000010380000000000001038, - 128'h000000000000001000000000a0000000, + 128'h000000000000000800000000a0000000, 128'h00000000800000000000000000000800, - 128'hbffff800000000000000004000001000, + 128'hbffff800000000000000002000001000, 128'hffdff06f10500073000380670003a383, 128'h0203839301c383b30105ae0300c5a383, 128'h105000733047d07306c5a58300000597, diff --git a/sim/scripts/vsim_tile.tcl b/sim/scripts/vsim_tile.tcl index 8763440..09e5938 100644 --- a/sim/scripts/vsim_tile.tcl +++ b/sim/scripts/vsim_tile.tcl @@ -5,16 +5,16 @@ # Create group for Tile $1 onerror {resume} -set tile_path $2 +set tile_path $3 # Add waves for tcdm_mapper and csrs -# add wave -noupdate -group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* -# add wave -noupdate -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* +# add wave -noupdate -group group[$2] -group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* +# add wave -noupdate -group group[$2] -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* # Add waves for xbars -add wave -noupdate -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* -add wave -noupdate -group tile[$1] -group wide_xbar ${tile_path}/i_tile/i_axi_wide_xbar/* +add wave -noupdate -group group[$2] -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* +add wave -noupdate -group group[$2] -group tile[$1] -group wide_xbar 
${tile_path}/i_tile/i_axi_wide_xbar/* -add wave -noupdate -group Barrier -group tile[$1] ${tile_path}/i_tile/i_cachepool_tile_barrier/* +add wave -noupdate -group Barrier -group group[$2] -group tile[$1] ${tile_path}/i_tile/i_cachepool_tile_barrier/* # Add waves for cache controller for {set c 0} {$c < 4} {incr c} { @@ -22,21 +22,21 @@ for {set c 0} {$c < 4} {incr c} { set cache_path ${tile_path}/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller - add wave -noupdate -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[4]/gen_amo/i_cache_amo/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[4]/gen_amo/i_cache_amo/* - add wave -noupdate -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* - add wave -noupdate -group tile[$1] -group cache[$c] -group core ${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl1 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group core 
${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl1 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* } for {set c 0} {$c < 5} {incr c} { - add wave -noupdate -group tile[$1] -group cache_xbar -group xbar[$c] ${tile_path}/i_tile/gen_cache_xbar[$c]/i_cache_xbar/* + add wave -noupdate -group group[$2] -group tile[$1] -group cache_xbar -group xbar[$c] ${tile_path}/i_tile/gen_cache_xbar[$c]/gen_remote_group_slice/i_cache_xbar/* } # Add waves for remaining signals -add wave -noupdate -group tile[$1] -group Internal ${tile_path}/i_tile/* +add wave -noupdate -group group[$2] -group tile[$1] -group Internal ${tile_path}/i_tile/* diff --git a/sim/scripts/vsim_wave.tcl b/sim/scripts/vsim_wave.tcl index 994f504..723184b 100644 --- a/sim/scripts/vsim_wave.tcl +++ b/sim/scripts/vsim_wave.tcl @@ -27,17 +27,17 @@ for {set g 0} {$g < $NUM_GROUPS} {incr g} { do sim/scripts/vsim_group.tcl ${group_path} 5 # Conditional plotting based on the group - if {$g == 0} { + if {$g <= 1} { # 2. 
Call to plot tile 0 and tile 3 for Group 0 only - foreach tile {0 3} { + foreach tile {0 1 2 3} { set tile_path ${group_path}/gen_tiles[$tile]/gen_tile - do sim/scripts/vsim_tile.tcl $tile ${tile_path} + do sim/scripts/vsim_tile.tcl $tile $g ${tile_path} # 3. Plot all cores in the plotted tile for {set core 0} {$core < $NUM_CORES} {incr core} { set core_path ${tile_path}/i_tile/gen_core[$core] # Pass an empty string to indicate NO parent group - do sim/scripts/vsim_core.tcl 0 $tile $core ${core_path} "" + do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "" } } } else { @@ -47,9 +47,12 @@ for {set g 0} {$g < $NUM_GROUPS} {incr g} { set tile_path ${group_path}/gen_tiles[$tile]/gen_tile set core_path ${tile_path}/i_tile/gen_core[$core] - # FIX: Use 'do' instead of 'source' and pass just the parent group name do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "GroupWP_$g" } + # set group_wp_path ${cluster_path}/gen_group[1]/i_group + # set group_path ${group_wp_path}/i_group + # set tile_path ${group_path}/gen_tiles[2]/gen_tile + # do sim/scripts/vsim_tile.tcl 2 ${tile_path} } # Add DRAM waves once at the end From af749950c40d47f9c2170ba572d13608b3a83649 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 14 May 2026 09:51:16 +0200 Subject: [PATCH 11/37] [SRC] Add generated noc package --- hardware/generated/floo_cachepool_noc_pkg.sv | 240 +++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 hardware/generated/floo_cachepool_noc_pkg.sv diff --git a/hardware/generated/floo_cachepool_noc_pkg.sv b/hardware/generated/floo_cachepool_noc_pkg.sv new file mode 100644 index 0000000..e475728 --- /dev/null +++ b/hardware/generated/floo_cachepool_noc_pkg.sv @@ -0,0 +1,240 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED! DO NOT EDIT! 
+ +`include "axi/typedef.svh" +`include "floo_noc/typedef.svh" + +package floo_cachepool_noc_pkg; + + import floo_pkg::*; + + ///////////////////// + // Address Map // + ///////////////////// + + typedef enum logic[3:0] { + GroupX0Y0 = 0, + GroupX0Y1 = 1, + GroupX1Y0 = 2, + GroupX1Y1 = 3, + Hbm0 = 4, + Hbm1 = 5, + Hbm2 = 6, + Hbm3 = 7, + HostPeri = 8, + NumEndpoints = 9} ep_id_e; + + + + typedef enum logic[2:0] { + Hbm0SamIdx = 0, + Hbm1SamIdx = 1, + Hbm2SamIdx = 2, + Hbm3SamIdx = 3, + HostPeriSamIdx = 5} sam_idx_e; + + + + typedef logic[0:0] rob_idx_t; +typedef logic[0:0] port_id_t; +typedef logic[3:0] id_t; +typedef logic[8:0] route_t; + + + typedef struct packed { + id_t idx; + id_t start_addr; + id_t end_addr; + } route_map_rule_t; + + localparam int unsigned SamNumRules = 6; + +typedef struct packed { + id_t idx; + logic [31:0] start_addr; + logic [31:0] end_addr; +} sam_rule_t; + +localparam sam_rule_t[SamNumRules-1:0] Sam = '{ +'{ idx: 8, + start_addr: 32'h00000000, + end_addr: 32'h7fffffff},// HostPeri +'{ idx: 8, + start_addr: 32'ha0000000, + end_addr: 32'hc000ffff},// HostPeri +'{ idx: 7, + start_addr: 32'h80300000, + end_addr: 32'h80400000},// Hbm3 +'{ idx: 6, + start_addr: 32'h80200000, + end_addr: 32'h80300000},// Hbm2 +'{ idx: 5, + start_addr: 32'h80100000, + end_addr: 32'h80200000},// Hbm1 +'{ idx: 4, + start_addr: 32'h80000000, + end_addr: 32'h80100000} // Hbm0 + +}; + + + localparam route_t[NumEndpoints-1:0][NumEndpoints-1:0] RoutingTables = '{ +'{ +9'b000000000,// -> host_peri_ni +9'b001001000,// -> hbm_ni_3 +9'b000001001,// -> hbm_ni_2 +9'b000011000,// -> hbm_ni_1 +9'b000000011,// -> hbm_ni_0 +9'b100001000,// -> group_ni_1_1 +9'b000100001,// -> group_ni_1_0 +9'b000100000,// -> group_ni_0_1 +9'b000000100 // -> group_ni_0_0 +}, +'{ +9'b010010011,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000000100,// -> group_ni_1_1 +9'b000100010,// -> group_ni_1_0 
+9'b000100011,// -> group_ni_0_1 +9'b100010011 // -> group_ni_0_0 +}, +'{ +9'b000010011,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000100000,// -> group_ni_1_1 +9'b000000100,// -> group_ni_1_0 +9'b100000011,// -> group_ni_0_1 +9'b000100011 // -> group_ni_0_0 +}, +'{ +9'b000010010,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000100001,// -> group_ni_1_1 +9'b100001010,// -> group_ni_1_0 +9'b000000100,// -> group_ni_0_1 +9'b000100010 // -> group_ni_0_0 +}, +'{ +9'b000000010,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b100001000,// -> group_ni_1_1 +9'b000100001,// -> group_ni_1_0 +9'b000100000,// -> group_ni_0_1 +9'b000000100 // -> group_ni_0_0 +}, +'{ +9'b010010011,// -> host_peri_ni +9'b000000001,// -> hbm_ni_3 +9'b000001010,// -> hbm_ni_2 +9'b000011011,// -> hbm_ni_1 +9'b011010011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000010011,// -> host_peri_ni +9'b000001000,// -> hbm_ni_3 +9'b000000001,// -> hbm_ni_2 +9'b011000011,// -> hbm_ni_1 +9'b000011011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000010010,// -> host_peri_ni +9'b000001001,// -> hbm_ni_3 +9'b001001010,// -> hbm_ni_2 +9'b000000011,// -> hbm_ni_1 +9'b000011010,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000000010,// -> host_peri_ni +9'b001001000,// -> hbm_ni_3 +9'b000001001,// -> hbm_ni_2 +9'b000011000,// -> hbm_ni_1 +9'b000000011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 
+9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}} +; + + + localparam route_cfg_t RouteCfg = '{ RouteAlgo: SourceRouting, + UseIdTable: 1'b1, + XYAddrOffsetX: 0, + XYAddrOffsetY: 0, + IdAddrOffset: 0, + NumSamRules: 6, + NumRoutes: 9, + CollectiveCfg: '{ OpCfg: '{ EnNarrowMulticast: 1'b0, + EnWideMulticast: 1'b0, + EnLsbAnd: 1'b0, + EnFpAdd: 1'b0, + EnFpMul: 1'b0, + EnFpMin: 1'b0, + EnFpMax: 1'b0, + EnIntAdd: 1'b0, + EnIntMul: 1'b0, + EnIntMinS: 1'b0, + EnIntMinU: 1'b0, + EnIntMaxS: 1'b0, + EnIntMaxU: 1'b0}, + NarrRedCfg: RedDefaultCfg, + WideRedCfg: RedDefaultCfg}}; + + + + typedef logic[31:0] axi_wide_in_addr_t; +typedef logic[255:0] axi_wide_in_data_t; +typedef logic[31:0] axi_wide_in_strb_t; +typedef logic[1:0] axi_wide_in_id_t; +typedef logic[0:0] axi_wide_in_user_t; +`AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) + + + typedef logic[31:0] axi_wide_out_addr_t; +typedef logic[255:0] axi_wide_out_data_t; +typedef logic[31:0] axi_wide_out_strb_t; +typedef logic[1:0] axi_wide_out_id_t; +typedef logic[0:0] axi_wide_out_user_t; +`AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, axi_wide_out_user_t) + + + + `FLOO_TYPEDEF_HDR_T(hdr_t, route_t, id_t, axi_ch_e, rob_idx_t) + localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, + DataWidth: 256, + InIdWidth: 2, + OutIdWidth: 2, + UserWidth: 1}; +`FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_wide_in, AxiCfg, hdr_t) + +`FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) + + +endpackage From e9d3851dd704a9e60042ced0ee6dd943a8fe28ac Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 14 May 2026 10:16:32 +0200 Subject: [PATCH 12/37] [SW] Add print float function --- software/snRuntime/include/snrt.h | 8 ++++++++ software/snRuntime/src/printf.c | 30 
++++++++++++++++++++++++++++++ software/tests/fdotp-32b/main.c | 10 +++++++--- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/software/snRuntime/include/snrt.h b/software/snRuntime/include/snrt.h index ae91213..1d5358a 100644 --- a/software/snRuntime/include/snrt.h +++ b/software/snRuntime/include/snrt.h @@ -291,6 +291,14 @@ static inline void snrt_mutex_release(volatile uint32_t *pmtx) { dm_exit(); \ snrt_cluster_hw_barrier(); +//================================================================================ +// Printf functions +//================================================================================ + +// Print a float value without promoting to double (avoids fcvt.d.s / fsd, +// which are illegal on rv32imaf). All arithmetic stays in single precision. +extern void snrt_printf_float(float val); + #ifdef __cplusplus } #endif diff --git a/software/snRuntime/src/printf.c b/software/snRuntime/src/printf.c index 744a441..7c78245 100644 --- a/software/snRuntime/src/printf.c +++ b/software/snRuntime/src/printf.c @@ -26,3 +26,33 @@ void snrt_putchar(char character); // Include the vendorized tiny printf implementation. #include "../vendor/printf.c" + +// Print a single-precision float as a decimal string without promoting to +// double. Passing a float through a variadic (...) argument promotes it to +// double per the C standard, generating fcvt.d.s / fsd which are illegal on +// rv32imaf. This wrapper takes the value as a named argument (no promotion) +// and keeps all arithmetic in single precision. +void snrt_printf_float(float val) { + uint32_t bits; + __builtin_memcpy(&bits, &val, sizeof(uint32_t)); + + uint32_t exp = (bits >> 23) & 0xFFU; + uint32_t mant = bits & 0x7FFFFFU; + + if (exp == 0xFFU) { + if (mant != 0U) + printf("NaN"); + else + printf("%sInf", (bits >> 31) ? 
"-" : ""); + return; + } + + if (bits >> 31) { + _putchar('-'); + val = -val; + } + + uint32_t int_part = (uint32_t)val; + uint32_t frac_part = (uint32_t)((val - (float)int_part) * 1000000.0f); + printf("%u.%06u", int_part, frac_part); +} diff --git a/software/tests/fdotp-32b/main.c b/software/tests/fdotp-32b/main.c index 0592ff3..558ed2c 100644 --- a/software/tests/fdotp-32b/main.c +++ b/software/tests/fdotp-32b/main.c @@ -27,7 +27,7 @@ int main() { const uint32_t num_cores = snrt_cluster_core_num(); const uint32_t cid = snrt_cluster_core_idx(); - const uint32_t measure_iter = 3; + const uint32_t measure_iter = 1; /*** DRAM Parameters for Optimization ***/ const uint32_t l2_interleave = 16; @@ -56,7 +56,7 @@ int main() { } else { if (cid == 0) { printf("FATAL: Problem size too small!\n"); - return 0; + return -2; } } @@ -126,7 +126,7 @@ int main() { else if (lmul >= 1) acc = fdotp_v32b_lmul1(a_int, b_int, elem_jump_per_round, elem_per_round, rounds); else - return 0; + return -3; result[cid] = acc; @@ -176,6 +176,10 @@ int main() { if (cid == 0) { if (fp_check(result[0], dotp_result*measure_iter)) { printf("Check Failed!\n"); + printf("Calc:"); snrt_printf_float(result[0]); + printf(", Exp:"); snrt_printf_float((float)(dotp_result * measure_iter)); + printf("\n"); + return -1; } } From 72df8d54438f23ab3851c81e4a68fcf7003892f2 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 14 May 2026 11:08:19 +0200 Subject: [PATCH 13/37] [Periph] Switch back to fully-shared mode. 
--- config/cachepool.hjson | 6 +-- .../cachepool_peripheral_reg.hjson | 2 +- .../cachepool_peripheral_reg_top.sv | 2 +- hardware/src/cachepool_group_noc_wrapper.sv | 48 ------------------- hardware/src/tcdm_cache_interco.sv | 16 ------- 5 files changed, 5 insertions(+), 69 deletions(-) diff --git a/config/cachepool.hjson b/config/cachepool.hjson index 2ac0947..652b4b7 100644 --- a/config/cachepool.hjson +++ b/config/cachepool.hjson @@ -53,11 +53,11 @@ register_offload_rsp: true }, - nr_tiles: 4, + nr_tiles: 8, - // Repeat the compute core template N times (driven by 16) + // Repeat the compute core template N times (driven by 32) cores: [ - { $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" } + { $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: 
"#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" } ], icache: { diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson index 068cce3..79d7cda 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg.hjson @@ -232,7 +232,7 @@ desc: '''Number of private banks configured per tile ''' swaccess: "rw", hwaccess: "hro", - resval: "4", + resval: "0", fields: [{ bits: "3:0", name: "NUMBER", diff --git a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv index adef765..c6ece73 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv @@ -517,7 +517,7 @@ module cachepool_peripheral_reg_top #( prim_subreg #( .DW (4), .SWACCESS("RW"), - .RESVAL (4'h4) + .RESVAL (4'h0) ) u_l1d_private ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv index b62ca36..c4bd306 100644 --- a/hardware/src/cachepool_group_noc_wrapper.sv +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -318,24 +318,6 @@ module cachepool_group_noc_wrapper assign packed_req_valid[J] = mst_xbar_req_valid[J]; assign mst_xbar_req_ready[J] = packed_req_ready[J]; -`ifndef TARGET_SYNTHESIS - initial begin - #100; - $display("[NOC_DBG] group(%0d,%0d) port J=%0d: dyn_off=%0d CacheBankBits=%0d TileWidth=%0d XWidth=%0d YWidth=%0d", - group_xy_id_i.x, group_xy_id_i.y, J, - dynamic_offset_i, 
NocCacheBankBits, NocAddrTileWidth, NocAddrXWidth, NocAddrYWidth); - end - always @(posedge clk_i) begin - if (packed_req_valid[J] && packed_req_ready[J]) begin - $display("[NOC_INJ] t=%0t dyn_off=%0d group(%0d,%0d) J=%0d addr=0x%08x dst(%0d,%0d) src(%0d,%0d)", - $time, dynamic_offset_i, - group_xy_id_i.x, group_xy_id_i.y, J, - mst_xbar_req[J].addr, - packed_req[J].hdr.dst_id.x, packed_req[J].hdr.dst_id.y, - packed_req[J].hdr.src_id.x, packed_req[J].hdr.src_id.y); - end - end -`endif end end : gen_mst_t @@ -443,22 +425,6 @@ module cachepool_group_noc_wrapper * NumRemoteGroupPortTile + eject_req[j].hdr.src_port_id); -`ifndef TARGET_SYNTHESIS - always @(posedge clk_i) begin - if (eject_req_valid[j] && eject_req_ready[j]) begin - $display("[SLV_EJECT_OK] t=%0t group(%0d,%0d) j=%0d addr=0x%08x tile_bits=%0d sel=%0d", - $time, group_xy_id_i.x, group_xy_id_i.y, j, - eject_req[j].payload.addr, - eject_req[j].payload.addr[(dynamic_offset_i + NocCacheBankBits) +: NocAddrTileWidth], - slv_xbar_slv_sel[j]); - end - if (eject_req_valid[j] && !eject_req_ready[j]) begin - $display("[SLV_EJECT_STALL] t=%0t group(%0d,%0d) j=%0d addr=0x%08x sel=%0d", - $time, group_xy_id_i.x, group_xy_id_i.y, j, - eject_req[j].payload.addr, slv_xbar_slv_sel[j]); - end - end -`endif end assign inject_rsp = slv_xbar_slv_rsp; @@ -522,20 +488,6 @@ module cachepool_group_noc_wrapper remote_group_rsp_from_group[SLV].q_ready; assign remote_group_req_to_group[SLV].p_ready = slv_xbar_mst_rsp_ready[J]; -`ifndef TARGET_SYNTHESIS - always @(posedge clk_i) begin - if (slv_xbar_mst_req_valid[J] && slv_xbar_mst_req_ready[J]) begin - $display("[SLV_DELIVER_OK] t=%0t group(%0d,%0d) J=%0d SLV=%0d addr=0x%08x", - $time, group_xy_id_i.x, group_xy_id_i.y, J, SLV, - slv_xbar_mst_req[J].payload.addr); - end - if (slv_xbar_mst_req_valid[J] && !slv_xbar_mst_req_ready[J]) begin - $display("[SLV_DELIVER_STALL] t=%0t group(%0d,%0d) J=%0d SLV=%0d l1d_busy=%0b q_ready=%0b", - $time, group_xy_id_i.x, group_xy_id_i.y, J, SLV, - 
l1d_busy_i[t], remote_group_rsp_from_group[SLV].q_ready); - end - end -`endif assign slv_xbar_mst_rsp[J].payload = remote_group_rsp_from_group[SLV].p; diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index b989b08..13e2019 100644 --- a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -536,21 +536,5 @@ module tcdm_cache_interco #( assign mem_rsp_ready_o = mem_rsp_ready; -`ifndef TARGET_SYNTHESIS - // DEBUG: print inter-group request handshakes with decoded group ID - for (genvar rg = 0; rg < NumRemoteGroupPort; rg++) begin : gen_dbg_rg - localparam int unsigned P = NumCache + NumRemotePort + rg; - always @(posedge clk_i) begin - if (mem_req_valid[P] && mem_req_ready[P]) begin - $display("[INTERCO_RG] t=%0t tile=%0d rg=%0d dyn_off=%0d addr=0x%08x tile_id_field=%0d group_id=%0d CacheBankBits=%0d LocalTileBits=%0d GroupBits=%0d", - $time, tile_id_i, rg, dynamic_offset_i, - mem_req[P].addr, - mem_req[P].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth], - mem_req[P].addr[(dynamic_offset_i + CacheBankBits + LocalTileBits) +: (TileIDWidth - LocalTileBits)], - CacheBankBits, LocalTileBits, TileIDWidth - LocalTileBits); - end - end - end -`endif endmodule From 9ffe75ffc54653e85c2b23ede58ccaccf4e00aaa Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 14 May 2026 17:35:07 +0200 Subject: [PATCH 14/37] Code Cleaning. --- hardware/src/tcdm_cache_interco.sv | 96 +++++++++++++----------------- software/tests/fdotp-32b/main.c | 27 ++++++--- software/tests/gemv/main.c | 10 +++- 3 files changed, 71 insertions(+), 62 deletions(-) diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index 13e2019..199edd3 100644 --- a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -46,7 +46,7 @@ module tcdm_cache_interco #( parameter int unsigned NumCores = 32'd0, /// Number of remote ports added to xbar for intra-group traffic ('>= 0'). 
parameter int unsigned NumRemotePort = 32'd0, - /// Number of dedicated inter-group inter-group remote ports ('>= 0'). + /// Number of dedicated inter-group remote ports ('>= 0'). /// When 0, the module behaves identically to the single-group configuration. /// Each inter-group remote port serves as both an output (requests to other groups) and an /// input (requests arriving from other groups), mirroring NumRemotePort. @@ -82,7 +82,9 @@ module tcdm_cache_interco #( parameter snitch_pkg::topo_e Topology = snitch_pkg::LogarithmicInterconnect, /// Dependency parameters – do not override. parameter type tile_id_t = logic [TileIDWidth-1:0], - parameter type addr_t = logic [AddrWidth-1:0] + parameter type addr_t = logic [AddrWidth-1:0], + localparam TotInPorts = NumCores+NumRemotePort+NumRemoteGroupPort, + localparam TotOutPorts = NumCache+NumRemotePort+NumRemoteGroupPort ) ( /// Clock, positive edge triggered. @@ -98,12 +100,12 @@ module tcdm_cache_interco #( input logic [$clog2(NumCache):0] num_private_cache_i, /// Partitioning address input addr_t private_start_addr_i, - /// Request port (cores + intra-group remote-in + inter-group inter-group remote-in) ---- - input tcdm_req_t [NumCores+NumRemotePort+NumRemoteGroupPort-1:0] core_req_i, + /// Request port (cores + intra-group remote-in + inter-group remote-in). + input tcdm_req_t [TotInPorts-1:0] core_req_i, /// Response ready in. - input logic [NumCores+NumRemotePort+NumRemoteGroupPort-1:0] core_rsp_ready_i, - /// Response port (cores + intra-group remote-in + inter-group inter-group remote-in). - output tcdm_rsp_t [NumCores+NumRemotePort+NumRemoteGroupPort-1:0] core_rsp_o, + input logic [TotInPorts-1:0] core_rsp_ready_i, + /// Response port (cores + intra-group remote-in + inter-group remote-in). + output tcdm_rsp_t [TotInPorts-1:0] core_rsp_o, /// Memory side ------------------------------------------------------- /// Which remote tile is targeted (one entry per intra-group remote output). 
output tile_id_t [NumRemotePort-1:0] tile_sel_o, @@ -111,13 +113,13 @@ module tcdm_cache_interco #( /// Carries the full globally-unique tile ID; the wrapper decomposes it /// into group XY coordinates for the router and local tile ID for the /// receiving-side xbar. - output tile_id_t [NumRemoteGroupPort-1:0] remote_group_sel_o, - /// Requests to cache banks, intra-group remote, and inter-group inter-group remote ports. - output tcdm_req_t [NumCache+NumRemotePort+NumRemoteGroupPort-1:0] mem_req_o, + output tile_id_t [NumRemoteGroupPort-1:0] remote_group_sel_o, + /// Requests to cache banks, intra-group remote, and inter-group remote ports. + output tcdm_req_t [TotOutPorts-1:0] mem_req_o, /// Response ready out. - output logic [NumCache+NumRemotePort+NumRemoteGroupPort-1:0] mem_rsp_ready_o, - /// Responses from cache banks, intra-group remote, and inter-group inter-group remote ports. - input tcdm_rsp_t [NumCache+NumRemotePort+NumRemoteGroupPort-1:0] mem_rsp_i + output logic [TotOutPorts-1:0] mem_rsp_ready_o, + /// Responses from cache banks, intra-group remote, and inter-group remote ports. + input tcdm_rsp_t [TotOutPorts-1:0] mem_rsp_i ); // ------------------------------------------------------------------------- @@ -135,13 +137,12 @@ module tcdm_cache_interco #( localparam int unsigned CacheBankBits = $clog2(NumCache); // Bits needed to select the tile in the shared address space. // Equals TileIDWidth by construction (NumTotCache / NumCache == NumTotalTiles). - localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); + localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); // Group extraction: number of bits to identify the group within TileID. - // GroupBits = TileBits - LocalTileBits, where LocalTileBits = $clog2(NumTilesPerGroup). + // LocalTileBits = $clog2(NumTilesPerGroup); GroupBits = TileBits - LocalTileBits. // Only meaningful when NumRemoteGroupPort > 0. 
localparam int unsigned LocalTileBits = $clog2(NumTilesPerGroup); - localparam int unsigned GroupBits = TileBits - LocalTileBits; // ------------------------------------------------------------------------- // Types @@ -157,8 +158,6 @@ module tcdm_cache_interco #( // Xbar routing signals. core_sel_t [NumInp-1:0] core_req_sel; mem_sel_t [NumOut-1:0] mem_rsp_sel; - // '1' when this request stays on local banks. - logic [NumInp-1:0] local_sel; // '1' when a request targets the private partition. logic [NumInp-1:0] is_private; @@ -207,33 +206,33 @@ module tcdm_cache_interco #( // ------------------------------------------------------------------------- reqrsp_xbar #( - .NumInp (NumInp ), - .NumOut (NumOut ), + .NumInp (NumInp ), + .NumOut (NumOut ), .PipeReg (1'b0 ), .ExtReqPrio (1'b0 ), .ExtRspPrio (1'b0 ), .tcdm_req_chan_t (tcdm_req_chan_t ), .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) ) i_cache_xbar ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (core_req ), - .slv_rr_i ('0 ), - .slv_req_valid_i (core_req_valid ), - .slv_req_ready_o (core_req_ready ), - .slv_rsp_o (core_rsp ), - .slv_rsp_valid_o (core_rsp_valid ), - .slv_rsp_ready_i (core_rsp_ready ), - .slv_sel_i (core_req_sel ), - .slv_selected_o (/* unused */ ), - .mst_req_o (mem_req ), - .mst_rr_i ('0 ), - .mst_req_valid_o (mem_req_valid ), - .mst_req_ready_i (mem_req_ready ), - .mst_rsp_i (mem_rsp ), - .mst_rsp_valid_i (mem_rsp_valid ), - .mst_rsp_ready_o (mem_rsp_ready ), - .mst_sel_i (mem_rsp_sel ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (core_req ), + .slv_rr_i ('0 ), + .slv_req_valid_i (core_req_valid ), + .slv_req_ready_o (core_req_ready ), + .slv_rsp_o (core_rsp ), + .slv_rsp_valid_o (core_rsp_valid ), + .slv_rsp_ready_i (core_rsp_ready ), + .slv_sel_i (core_req_sel ), + .slv_selected_o (/* unused */ ), + .mst_req_o (mem_req ), + .mst_rr_i ('0 ), + .mst_req_valid_o (mem_req_valid ), + .mst_req_ready_i (mem_req_ready ), + .mst_rsp_i (mem_rsp ), + .mst_rsp_valid_i (mem_rsp_valid ), + 
.mst_rsp_ready_o (mem_rsp_ready ), + .mst_sel_i (mem_rsp_sel ) ); // ------------------------------------------------------------------------- @@ -280,11 +279,10 @@ module tcdm_cache_interco #( always_comb begin // Defaults. - local_sel[port] = 1'b1; core_req_sel[port] = '0; // Extract the raw BankSel field from the address. - addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; + addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; // Extract the full tile ID (group + local) from the address. addr_tile_id = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; // Extract group portion (upper bits of tile ID). @@ -296,24 +294,20 @@ module tcdm_cache_interco #( || (NumTiles == 1 && NumRemoteGroupPort == 0)) begin // All-private, or single-tile single-group: every request is local. // Use the full BankSel field directly (no folding needed). - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank); end else if (num_private_cache_q == '0) begin // All-shared: full three-way classification. if (NumRemoteGroupPort > 0 && !same_group) begin // Inter-group: route to inter-group remote port. - local_sel[port] = 1'b0; core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + (port % NumRemoteGroupPort)); end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] && !(NumTiles == 1)) begin // Intra-group remote: different tile, same group. - local_sel[port] = 1'b0; core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); end else begin // Local: same tile. - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank); end @@ -321,23 +315,19 @@ module tcdm_cache_interco #( // Mixed partition: fold addr_bank into the appropriate partition. if (is_private[port]) begin // Private request: always local. - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank % num_private_cache_q); end else begin // Shared request: three-way classification. 
if (NumRemoteGroupPort > 0 && !same_group) begin // Inter-group: route to inter-group remote port. - local_sel[port] = 1'b0; core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + (port % NumRemoteGroupPort)); end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] && !(NumTiles == 1)) begin // Intra-group remote: different tile, same group. - local_sel[port] = 1'b0; core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); end else begin // Local: same tile. - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(num_private_cache_q + (addr_bank % num_shared_cache_q)); end @@ -352,7 +342,7 @@ module tcdm_cache_interco #( // // Responses from local cache banks are routed back to the originating // core using core_id. Responses from intra-group remote tiles and - // inter-group inter-group remote ports carry a tile_id that differs from tile_id_i; + // inter-group remote ports carry a tile_id that differs from tile_id_i; // these are forwarded to the corresponding remote-in or inter-group remote-in port. for (genvar port = 0; port < NumOut; port++) begin : gen_rsp_sel @@ -521,11 +511,11 @@ module tcdm_cache_interco #( end else if (port < NumCache + NumRemotePort) begin // Intra-group remote port: pass address untouched; extract target tile ID. tile_sel_o[port - NumCache] = - mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; + mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; end else begin - // Inter-group inter-group remote port: pass address untouched; extract target tile ID. + // Inter-group remote port: pass address untouched; extract target tile ID. 
remote_group_sel_o[port - NumCache - NumRemotePort] = - mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; + mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; end end diff --git a/software/tests/fdotp-32b/main.c b/software/tests/fdotp-32b/main.c index 558ed2c..844bb6c 100644 --- a/software/tests/fdotp-32b/main.c +++ b/software/tests/fdotp-32b/main.c @@ -27,7 +27,7 @@ int main() { const uint32_t num_cores = snrt_cluster_core_num(); const uint32_t cid = snrt_cluster_core_idx(); - const uint32_t measure_iter = 1; + const uint32_t measure_iter = 3; /*** DRAM Parameters for Optimization ***/ const uint32_t l2_interleave = 16; @@ -143,11 +143,22 @@ int main() { stop_kernel(); } - // Final reduction + // Final reduction: two-level tree with group size 4 + const uint32_t red_group = 4; + + // Level 1: lead core of each group accumulates its group + if (cid % red_group == 0) { + for (uint32_t i = 1; i < red_group && (cid + i) < num_cores; ++i) + acc += result[cid + i]; + result[cid] = acc; + } + + snrt_cluster_hw_barrier(); + + // Level 2: core 0 sums all group results if (cid == 0) { - // timer_tmp = benchmark_get_cycle() - timer_tmp; - for (uint32_t i = 1; i < num_cores; ++i) - acc += result[i]; + for (uint32_t g = red_group; g < num_cores; g += red_group) + acc += result[g]; result[0] = acc; } @@ -176,8 +187,10 @@ int main() { if (cid == 0) { if (fp_check(result[0], dotp_result*measure_iter)) { printf("Check Failed!\n"); - printf("Calc:"); snrt_printf_float(result[0]); - printf(", Exp:"); snrt_printf_float((float)(dotp_result * measure_iter)); + printf("Calc:"); + snrt_printf_float(result[0]); + printf(", Exp:"); + snrt_printf_float((float)(dotp_result * measure_iter)); printf("\n"); return -1; } diff --git a/software/tests/gemv/main.c b/software/tests/gemv/main.c index 6fb3bd0..d136caa 100644 --- a/software/tests/gemv/main.c +++ b/software/tests/gemv/main.c @@ -62,8 +62,10 @@ int main() { // Allocate the matrices if (cid == 0) { - 
// Set xbar policy + // We use all-private mode for this kernel l1d_xbar_config(offset); + l1d_init(0); + l1d_part(4); } // Reset timer @@ -126,7 +128,11 @@ int main() { for (uint32_t j = 0; j < gemv_l.M; j++) { if (fp_check(&result[j], &gemv_result[j])) { - printf("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); + printf("Error: ID: %i Calc", i); + snrt_printf_float(result[i]); + printf(",Exp:"); + snrt_printf_float(gemv_result[i]); + printf("\n"); } } } From c193405c4a128341a66e0f60b00692d004890f68 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 15 May 2026 10:28:29 +0200 Subject: [PATCH 15/37] [CFG] Rename configuration --- Makefile | 2 +- config/cachepool_128.mk | 96 ------------------- config/{cachepool_512.mk => cachepool_2g.mk} | 16 ++-- config/cachepool_fpu_128.mk | 96 ------------------- ...chepool_fpu_512.mk => cachepool_fpu_2g.mk} | 0 ...chepool_fpu_256.mk => cachepool_fpu_4g.mk} | 20 ++-- util/auto-benchmark/configs-ci.sh | 5 +- util/auto-benchmark/configs.sh | 6 +- 8 files changed, 26 insertions(+), 215 deletions(-) delete mode 100644 config/cachepool_128.mk rename config/{cachepool_512.mk => cachepool_2g.mk} (90%) delete mode 100644 config/cachepool_fpu_128.mk rename config/{cachepool_fpu_512.mk => cachepool_fpu_2g.mk} (100%) rename config/{cachepool_fpu_256.mk => cachepool_fpu_4g.mk} (88%) diff --git a/Makefile b/Makefile index 3264af7..f11d183 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ CACHE_PATH := $(shell [ -x "$(BENDER)" ] && $(BENDER) path insitu-cac # Configurations CFG_DIR ?= ${CACHEPOOL_DIR}/config -config ?= cachepool_512 +config ?= cachepool_fpu_2g # Compiler choice for SW cmake COMPILER ?= llvm diff --git a/config/cachepool_128.mk b/config/cachepool_128.mk deleted file mode 100644 index df52dab..0000000 --- a/config/cachepool_128.mk +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2025 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -# Author: Diyou Shen, ETH Zurich - -######################### -## CachePool Cluster ## -######################### - -# Number of tiles -num_tiles ?= 4 - -# Number of cores -num_cores ?= 16 - -# Core datawidth -data_width ?= 32 - -# Core addrwidth -addr_width ?= 32 - - -###################### -## CachePool Tile ## -###################### - -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - -# Refill interconnection data width -refill_data_width ?= 128 - -##### L1 Data Cache ##### - -# L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 128 - -# L1 data cache size (in KiB) -l1d_size ?= 256 - -# L1 data cache banking factor (how many banks per core?) -l1d_bank_factor ?= 1 - -# L1 coalecsing window -l1d_coal_window ?= 2 - -# L1 data cache number of ways per -l1d_num_way ?= 4 - -# L1 data cache size per tile (KiB) -l1d_tile_size ?= 256 - -# L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 52 - -#################### -## CachePool CC ## -#################### -# Spatz fpu support? 
-spatz_fpu_en ?= 0 - -# Spatz number of FPU -spatz_num_fpu ?= 0 - -# Spatz number of IPU -spatz_num_ipu ?= 4 - -# Spatz max outstanding transactions -spatz_max_trans ?= 32 - -# Snitch/FPU max outstanding transactions -snitch_max_trans ?= 16 - - -##################### -## L2 Main Memory ## -##################### -# L2 number of channels -l2_channel ?= 4 - -# L2 bank width (DRAM width, change with care) -l2_bank_width ?= 512 - -# L2 interleaving factor (in order of bank_width) -l2_interleave ?= 16 - - -################## -## Peripherals ## -################## -# Hardware stack size (in Byte) -stack_hw_size ?= 1024 - -# Stack size (total, including share and private, 32'h800) -stack_tot_size ?= 2048 diff --git a/config/cachepool_512.mk b/config/cachepool_2g.mk similarity index 90% rename from config/cachepool_512.mk rename to config/cachepool_2g.mk index 6d04a68..b58155c 100644 --- a/config/cachepool_512.mk +++ b/config/cachepool_2g.mk @@ -8,11 +8,14 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 2 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,14 +23,15 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 @@ -48,7 +52,7 @@ l1d_coal_window ?= 2 # L1 data cache number of ways per l1d_num_way ?= 4 -# L1 data cache size **per tile** (KiB) +# L1 data cache size per tile (KiB) l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) diff --git a/config/cachepool_fpu_128.mk b/config/cachepool_fpu_128.mk deleted file mode 100644 index e60aad4..0000000 --- a/config/cachepool_fpu_128.mk +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2025 ETH Zurich and 
University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -# Author: Diyou Shen, ETH Zurich - -######################### -## CachePool Cluster ## -######################### - -# Number of tiles -num_tiles ?= 4 - -# Number of cores -num_cores ?= 16 - -# Core datawidth -data_width ?= 32 - -# Core addrwidth -addr_width ?= 32 - - -###################### -## CachePool Tile ## -###################### - -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - -# Refill interconnection data width -refill_data_width ?= 128 - -##### L1 Data Cache ##### - -# L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 128 - -# L1 data cache size (in KiB) -l1d_size ?= 256 - -# L1 data cache banking factor (how many banks per core?) -l1d_bank_factor ?= 1 - -# L1 coalecsing window -l1d_coal_window ?= 1 - -# L1 data cache number of ways per -l1d_num_way ?= 4 - -# L1 data cache size per tile (KiB) -l1d_tile_size ?= 256 - -# L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 64 - -#################### -## CachePool CC ## -#################### -# Spatz fpu support? 
-spatz_fpu_en ?= 1 - -# Spatz number of FPU -spatz_num_fpu ?= 4 - -# Spatz number of IPU -spatz_num_ipu ?= 4 - -# Spatz max outstanding transactions -spatz_max_trans ?= 32 - -# Snitch/FPU max outstanding transactions -snitch_max_trans ?= 16 - - -##################### -## L2 Main Memory ## -##################### -# L2 number of channels -l2_channel ?= 4 - -# L2 bank width (DRAM width, change with care) -l2_bank_width ?= 512 - -# L2 interleaving factor (in order of bank_width) -l2_interleave ?= 16 - - -################## -## Peripherals ## -################## -# Hardware stack size (in Byte) -stack_hw_size ?= 1024 - -# Stack size (total, including share and private, 32'h800) -stack_tot_size ?= 2048 diff --git a/config/cachepool_fpu_512.mk b/config/cachepool_fpu_2g.mk similarity index 100% rename from config/cachepool_fpu_512.mk rename to config/cachepool_fpu_2g.mk diff --git a/config/cachepool_fpu_256.mk b/config/cachepool_fpu_4g.mk similarity index 88% rename from config/cachepool_fpu_256.mk rename to config/cachepool_fpu_4g.mk index 279dc80..a9a5458 100644 --- a/config/cachepool_fpu_256.mk +++ b/config/cachepool_fpu_4g.mk @@ -8,11 +8,14 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 4 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,21 +23,22 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 ##### L1 Data Cache ##### # L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 256 +l1d_cacheline_width ?= 512 # L1 data cache size (in KiB) l1d_size ?= 256 @@ -52,7 +56,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be 
calcualted) -l1d_tag_data_width ?= 64 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## @@ -83,7 +87,7 @@ l2_channel ?= 4 l2_bank_width ?= 512 # L2 interleaving factor (in order of bank_width) -l2_interleave ?= 8 +l2_interleave ?= 16 ################## diff --git a/util/auto-benchmark/configs-ci.sh b/util/auto-benchmark/configs-ci.sh index 3659ad2..1801682 100644 --- a/util/auto-benchmark/configs-ci.sh +++ b/util/auto-benchmark/configs-ci.sh @@ -1,6 +1,5 @@ # Configs and kernel suffixes (without prefix) -CONFIGS="cachepool_fpu_512" -KERNELS="fdotp-32b_M32768 gemv-opt_M1024_N128_K32 fmatmul-32b_M64_N64_K64" -# KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv-opt_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" +CONFIGS="cachepool_fpu_2g" +KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" PREFIX="test-cachepool-" # common prefix for all kernels ROOT_PATH=../.. # adjust if needed (path to repo root) diff --git a/util/auto-benchmark/configs.sh b/util/auto-benchmark/configs.sh index 1a545be..bb8fa6c 100755 --- a/util/auto-benchmark/configs.sh +++ b/util/auto-benchmark/configs.sh @@ -1,12 +1,8 @@ # Configs and kernel suffixes (without prefix) -# CONFIGS="cachepool_fpu_512" -CONFIGS="cachepool_fpu_128 cachepool_fpu_256 cachepool_fpu_512" +CONFIGS="cachepool_fpu_2g cachepool_fpu_4g" -# KERNELS="spin-lock fdotp-32b_M8192 fmatmul-32b_M32_N32_K32" -# KERNELS="fdotp-32b_M65536 gemv-opt_M1024_N128_K32 gemv_M1024_N128_K32" KERNELS="spin-lock fdotp-32b_M65536 gemv-opt_M1024_N128_K32 gemv_M1024_N128_K32 fmatmul-32b_M64_N64_K64 multi_producer_single_consumer_double_linked_list_M1_N1350_K100 byte-enable" -# KERNELS="spin-lock fdotp-32b_M32768" PREFIX="test-cachepool-" # common prefix for all kernels ROOT_PATH=../.. 
# adjust if needed (path to repo root) From d748984f0e485a6da93c635e92eb8382af2b7ff1 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 15 May 2026 10:29:56 +0200 Subject: [PATCH 16/37] [TB][SW] Adjust cache initialization and cache flush flow to avoid data corruption at boot time and xbar reconfiguration time. --- hardware/tb/tb_cachepool.sv | 66 +++++++++++++++++++++++++++++++ software/snRuntime/src/l1cache.c | 48 +++++++++++----------- software/snRuntime/src/start.S | 8 ++++ software/tests/byte-enable/main.c | 1 - software/tests/fdotp-32b/main.c | 2 - software/tests/fmatmul-32b/main.c | 7 +--- software/tests/gemv/main.c | 1 - software/tests/idotp-32b/main.c | 2 - software/tests/load-store/main.c | 1 - 9 files changed, 102 insertions(+), 34 deletions(-) diff --git a/hardware/tb/tb_cachepool.sv b/hardware/tb/tb_cachepool.sv index 7da3e78..c1276bf 100644 --- a/hardware/tb/tb_cachepool.sv +++ b/hardware/tb/tb_cachepool.sv @@ -211,6 +211,72 @@ module tb_cachepool; to_cluster_req = '0; + // Initialize L1D cache before waking up cores + // Step 1: Write init instruction (flush + invalidate) + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_CFG_L1D_INSN_OFFSET, + data : 32'h3, + write : 1'b1, + strb : '1, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + + // Step 2: Commit the instruction + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_L1D_INSN_COMMIT_OFFSET, + data : 32'h1, + write : 1'b1, + strb : '1, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, 
default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + + // Step 3: Poll until flush complete + begin + automatic logic [31:0] flush_status; + do begin + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_L1D_FLUSH_STATUS_OFFSET, + write : 1'b0, + strb : '0, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + flush_status = to_cluster_rsp.p.data; + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + end while (flush_status[0]); + end + // Wake up cores debug_req = '1; @(posedge clk); diff --git a/software/snRuntime/src/l1cache.c b/software/snRuntime/src/l1cache.c index 79700d1..c52803c 100644 --- a/software/snRuntime/src/l1cache.c +++ b/software/snRuntime/src/l1cache.c @@ -20,6 +20,9 @@ void l1d_xbar_config(uint32_t offset) { (uint32_t *)(_snrt_team_current->root->cluster_mem.end + CACHEPOOL_PERIPHERAL_XBAR_OFFSET_REG_OFFSET); *cfg = offset; + // Flush cache before commit xbar changes + l1d_flush(); + l1d_wait(); l1d_xbar_commit(); } @@ -96,28 +99,29 @@ void l1d_wait() { } } -void l1d_spm_config (uint32_t size) { - // flush the cache before reconfiguration - l1d_flush(); - l1d_wait(); - // free all allocated region - snrt_l1alloc_reset(); - // set the pointers - volatile uint32_t *cfg_size = - (uint32_t *)(_snrt_team_current->root->cluster_mem.end + - CACHEPOOL_PERIPHERAL_CFG_L1D_SPM_REG_OFFSET); - volatile uint32_t *commit = - (uint32_t *)(_snrt_team_current->root->cluster_mem.end + - CACHEPOOL_PERIPHERAL_L1D_SPM_COMMIT_REG_OFFSET); - // Make sure dummy region will not be optimized away - volatile double *dummy; - // Should be (L1_size - size) * 128 - int cache_region = (128 - size) * 128; - dummy = (volatile double *)snrt_l1alloc(cache_region * sizeof(double)); - // change size and commit the change - *cfg_size 
= size; - *commit = 1; -} +// Used for hybrid SPM/cache, unused in CachePool now +// void l1d_spm_config (uint32_t size) { +// // flush the cache before reconfiguration +// l1d_flush(); +// l1d_wait(); +// // free all allocated region +// snrt_l1alloc_reset(); +// // set the pointers +// volatile uint32_t *cfg_size = +// (uint32_t *)(_snrt_team_current->root->cluster_mem.end + +// CACHEPOOL_PERIPHERAL_CFG_L1D_SPM_REG_OFFSET); +// volatile uint32_t *commit = +// (uint32_t *)(_snrt_team_current->root->cluster_mem.end + +// CACHEPOOL_PERIPHERAL_L1D_SPM_COMMIT_REG_OFFSET); +// // Make sure dummy region will not be optimized away +// volatile double *dummy; +// // Should be (L1_size - size) * 128 +// int cache_region = (128 - size) * 128; +// dummy = (volatile double *)snrt_l1alloc(cache_region * sizeof(double)); +// // change size and commit the change +// *cfg_size = size; +// *commit = 1; +// } // Used to configure the number of private cache banks per tile void l1d_part (uint32_t size) { diff --git a/software/snRuntime/src/start.S b/software/snRuntime/src/start.S index 34c0d93..7b2ce87 100644 --- a/software/snRuntime/src/start.S +++ b/software/snRuntime/src/start.S @@ -53,6 +53,14 @@ snrt.crt0.init_bss: blt t0, t1, 1b 2: +snrt.crt0.init_vec_registers: + li t0, -1 + vsetvli zero, t0, e32, m8, ta, ma + vmv.v.i v0, 0 + vmv.v.i v8, 0 + vmv.v.i v16, 0 + vmv.v.i v24, 0 + snrt.crt0.init_registers: # Clear FP registers csrr t0, misa diff --git a/software/tests/byte-enable/main.c b/software/tests/byte-enable/main.c index 7266687..3d64f07 100644 --- a/software/tests/byte-enable/main.c +++ b/software/tests/byte-enable/main.c @@ -400,7 +400,6 @@ int main(void) { const unsigned int core_id = snrt_cluster_core_idx(); if (core_id == 0) { - l1d_init(0); uint32_t offset = 31U - __builtin_clz((unsigned int)L1LineWidth); l1d_xbar_config(offset); } diff --git a/software/tests/fdotp-32b/main.c b/software/tests/fdotp-32b/main.c index 844bb6c..bf2ccb8 100644 --- 
a/software/tests/fdotp-32b/main.c +++ b/software/tests/fdotp-32b/main.c @@ -81,8 +81,6 @@ int main() { if (cid == 0) { // Set xbar policy l1d_xbar_config(l1_scramble_bits); - // Initialize the cache - l1d_init(0); } snrt_cluster_hw_barrier(); diff --git a/software/tests/fmatmul-32b/main.c b/software/tests/fmatmul-32b/main.c index 774b5fa..550cbb2 100644 --- a/software/tests/fmatmul-32b/main.c +++ b/software/tests/fmatmul-32b/main.c @@ -53,7 +53,7 @@ int verify_matrix(float *matrix, const float *checksum, // printf("Row: %d, Result: %x, Golden reselt: %x\n", i, print_sum, print_gold); } } - return error; + return -1; } int main() { @@ -77,10 +77,7 @@ int main() { // Set xbar policy // All cores will access the same B // Scramble based on cacheline - // l1d_xbar_config(5); l1d_xbar_config(5); - // Init the cache - l1d_init(0); } // Wait for all cores to finish @@ -137,7 +134,7 @@ int main() { } else if (kernel_size == 8) { matmul_8xVL(gemm_C_dram, gemm_A_dram, gemm_B_dram, m_start, m_end, gemm_l.K, gemm_l.N, p_start, p_end); } else { - return -2; + return -1; } // Wait for all cores to finish diff --git a/software/tests/gemv/main.c b/software/tests/gemv/main.c index d136caa..fd0ae8f 100644 --- a/software/tests/gemv/main.c +++ b/software/tests/gemv/main.c @@ -64,7 +64,6 @@ int main() { if (cid == 0) { // We use all-private mode for this kernel l1d_xbar_config(offset); - l1d_init(0); l1d_part(4); } diff --git a/software/tests/idotp-32b/main.c b/software/tests/idotp-32b/main.c index bbd21ce..0143e31 100644 --- a/software/tests/idotp-32b/main.c +++ b/software/tests/idotp-32b/main.c @@ -51,9 +51,7 @@ int main() { if (cid == 0) { // Set xbar policy - l1d_init(0); l1d_xbar_config(offset); - // Initialize the cache printf ("round:%u, lmul:%u, dim:%u\n", rounds, lmul, dim); } diff --git a/software/tests/load-store/main.c b/software/tests/load-store/main.c index 4c43947..79ccec1 100644 --- a/software/tests/load-store/main.c +++ b/software/tests/load-store/main.c @@ -105,7 
+105,6 @@ static int check_const(uint32_t *ptr, uint32_t count, uint32_t value, static void cache_cfg(uint32_t cid, uint32_t xbar_offset, uint32_t part) { if (cid == 0) { l1d_xbar_config(xbar_offset); - l1d_init(0); l1d_part(part); } sync_all(); From 0b7ae6a34a534ed86106d4e3bac042074d16bd20 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 15 May 2026 11:18:59 +0200 Subject: [PATCH 17/37] [SRC] Clean code. --- hardware/src/cachepool_cluster.sv | 54 +-- hardware/src/cachepool_group.sv | 34 +- hardware/src/cachepool_group_noc_wrapper.sv | 350 ++++++++++---------- hardware/src/cachepool_tile.sv | 70 ++-- 4 files changed, 199 insertions(+), 309 deletions(-) diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index 78d84c3..70523a6 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -4,21 +4,11 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" -`include "common_cells/assertions.svh" -`include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" `include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" -`include "reqrsp_interface/typedef.svh" -`include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" -/// A single-tile cluster implementation for CachePool +/// CachePool cluster: instantiates NumGroups groups connected via FlooNoC mesh, +/// with shared L2 memory and peripheral fabric. module cachepool_cluster import cachepool_pkg::*; import spatz_pkg::*; @@ -157,20 +147,10 @@ module cachepool_cluster // Imports // --------- import snitch_pkg::*; - import snitch_icache_pkg::icache_events_t; // --------- // Constants // --------- - /// Minimum width to hold the core number. 
- localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); - - // Enlarge the address width for Spatz due to cache - localparam int unsigned TCDMAddrWidth = 32; - - // Core Request, SoC Request - localparam int unsigned NrNarrowMasters = 2; - localparam int unsigned WideIdWidthOut = AxiIdWidthOut; localparam int unsigned WideIdWidthIn = WideIdWidthOut - $clog2(NumClusterMst) - GroupMuxIdBits; @@ -178,23 +158,6 @@ module cachepool_cluster // The multi-group axi_mux adds GroupMuxIdBits on top to reach WideIdWidthOut. localparam int unsigned WideIdWidthPreMux = WideIdWidthOut - GroupMuxIdBits; - // Cache XBar configuration struct - localparam axi_pkg::xbar_cfg_t CacheXbarCfg = '{ - NoSlvPorts : NumClusterMst*NumTiles, - NoMstPorts : ClusterWideOutAxiPorts, - MaxMstTrans : MaxMstTrans, - MaxSlvTrans : MaxSlvTrans, - FallThrough : 1'b0, - LatencyMode : XbarLatency, - AxiIdWidthSlvPorts: WideIdWidthIn, - AxiIdUsedSlvPorts : WideIdWidthIn, - UniqueIds : 1'b0, - AxiAddrWidth : AxiAddrWidth, - AxiDataWidth : AxiDataWidth, - NoAddrRules : ClusterWideOutAxiPorts - 1, - default : '0 - }; - // -------- // Typedefs // -------- @@ -214,16 +177,6 @@ module cachepool_cluster // Pre-mux AXI types (per-group reqrsp_to_axi output, input to axi_mux). `AXI_TYPEDEF_ALL(axi_premux_cache, addr_t, id_cache_premux_t, data_cache_t, strb_cache_t, user_cache_t) - `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t) - - typedef struct packed { - int unsigned idx; - addr_t start_addr; - addr_t end_addr; - } xbar_rule_t; - - `SNITCH_VM_TYPEDEF(AxiAddrWidth) - // ---------------- // Wire Definitions // ---------------- @@ -240,7 +193,6 @@ module cachepool_cluster // 3. 
Peripherals axi_addr_t private_start_addr; - icache_events_t [NrCores-1:0] icache_events; logic icache_prefetch_enable; logic [NrCores-1:0] cl_interrupt; logic [$clog2(L1AddrWidth)-1:0] dynamic_offset; @@ -336,7 +288,7 @@ module cachepool_cluster .l2_req_o ( l2_req[g] ), .l2_rsp_i ( l2_rsp[g] ), // Peripherals - .icache_events_o ( icache_events[g*NumCoreGroup +: NumCoreGroup] ), + .icache_events_o ( /* unused */ ), .icache_prefetch_enable_i ( icache_prefetch_enable ), .cl_interrupt_i ( cl_interrupt [g*NumCoreGroup +: NumCoreGroup] ), .dynamic_offset_i ( dynamic_offset ), diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index eb89c99..734617d 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -4,19 +4,9 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" -`include "common_cells/assertions.svh" `include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" `include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" -`include "reqrsp_interface/typedef.svh" -`include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" /// Group implementation for CachePool module cachepool_group @@ -190,24 +180,9 @@ module cachepool_group // --------- // Constants // --------- - /// Minimum width to hold the core number. - localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); - // localparam int unsigned TileIDWidth = cf_math_pkg::idx_width(NumTiles); - // Per-group overrides of package-level constants that depend on NumTiles/NumCores. 
- localparam int unsigned NrCoresTileLocal = NrCores / NumTilesPerGroup; localparam int unsigned NumL1CacheCtrlLocal = NrCores; - localparam int unsigned NumL1CtrlTileLocal = NumL1CacheCtrlLocal / NumTilesPerGroup; - - // Enlarge the address width for Spatz due to cache - localparam int unsigned TCDMAddrWidth = L1AddrWidth; - - // Per-tile inter-group remote port count (across all interco instances). - // Core Request, SoC Request - localparam int unsigned NrNarrowMasters = 2; - - localparam int unsigned WideIdWidthOut = AxiIdWidthOut; localparam int unsigned WideIdWidthIn = AxiIdWidthOut; @@ -218,13 +193,9 @@ module cachepool_group typedef logic [AxiDataWidth-1:0] data_cache_t; typedef logic [AxiDataWidth/8-1:0] strb_cache_t; typedef logic [WideIdWidthIn-1:0] id_cache_mst_t; - typedef logic [WideIdWidthOut-1:0] id_cache_slv_t; typedef logic [AxiUserWidth-1:0] user_cache_t; `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) - `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) - - `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t) typedef struct packed { int unsigned idx; @@ -232,8 +203,6 @@ module cachepool_group addr_t end_addr; } xbar_rule_t; - `SNITCH_VM_TYPEDEF(AxiAddrWidth) - // --------------- // CachePool Tile // --------------- @@ -362,7 +331,6 @@ module cachepool_group tile_sel_err_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel_err; tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel; - l2_sel_t [ClusterWideOutAxiPorts-1:0] tile_selected; l2_sel_t [ClusterWideOutAxiPorts-1:0] l2_sel; tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] l2_rsp_rr; @@ -574,7 +542,7 @@ module cachepool_group .slv_rsp_ready_i ( tile_rsp_ready ), .slv_sel_i ( tile_sel[NumTilesPerGroup*NumClusterMst-1:0] ), .slv_rr_i ( '0 ), - .slv_selected_o ( tile_selected ), + .slv_selected_o ( /* unused */ ), .mst_req_o ( l2_req_chan ), .mst_req_valid_o ( l2_req_valid ), 
.mst_req_ready_i ( l2_req_ready ), diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv index c4bd306..9e14914 100644 --- a/hardware/src/cachepool_group_noc_wrapper.sv +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -8,19 +8,6 @@ // // Author: Diyou Shen -`include "axi/assign.svh" -`include "axi/typedef.svh" -`include "common_cells/assertions.svh" -`include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" -`include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" -`include "reqrsp_interface/typedef.svh" -`include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" module cachepool_group_noc_wrapper import cachepool_pkg::*; @@ -72,33 +59,33 @@ module cachepool_group_noc_wrapper parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, parameter int unsigned NrSramCfg = 1 ) ( - input logic clk_i, - input logic rst_ni, - input logic [NrCores-1:0] debug_req_i, - input logic [NrCores-1:0] meip_i, - input logic [NrCores-1:0] mtip_i, - input logic [NrCores-1:0] msip_i, - input logic [9:0] hart_base_id_i, - input logic [TileIDWidth-1:0] tile_base_id_i, - input axi_addr_t cluster_base_addr_i, - input axi_addr_t private_start_addr_i, - output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, - input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, - output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, - input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, - output icache_events_t [NrCores-1:0] icache_events_o, - input logic icache_prefetch_enable_i, - input logic [NrCores-1:0] cl_interrupt_i, - input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, - input logic [3:0] l1d_private_i, - input cache_insn_t l1d_insn_i, - input logic l1d_insn_valid_i, - output logic [NumTilesPerGroup-1:0] 
l1d_insn_ready_o, - input logic [NumTilesPerGroup-1:0] l1d_busy_i, - input impl_in_t [NrSramCfg-1:0] impl_i, - output logic error_o, + input logic clk_i, + input logic rst_ni, + input logic [NrCores-1:0] debug_req_i, + input logic [NrCores-1:0] meip_i, + input logic [NrCores-1:0] mtip_i, + input logic [NrCores-1:0] msip_i, + input logic [9:0] hart_base_id_i, + input logic [TileIDWidth-1:0] tile_base_id_i, + input axi_addr_t cluster_base_addr_i, + input axi_addr_t private_start_addr_i, + output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, + input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, + output icache_events_t [NrCores-1:0] icache_events_o, + input logic icache_prefetch_enable_i, + input logic [NrCores-1:0] cl_interrupt_i, + input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, + input logic [3:0] l1d_private_i, + input cache_insn_t l1d_insn_i, + input logic l1d_insn_valid_i, + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, + input impl_in_t [NrSramCfg-1:0] impl_i, + output logic error_o, // XY coordinates of this group in the inter-group mesh - input group_xy_id_t group_xy_id_i, + input group_xy_id_t group_xy_id_i, // Inter-group req mesh: 4 directions (N=0,E=1,S=2,W=3) // dim1: direction, dim2: tile*NumNoCPortsPerTile+channel output noc_group_req_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_o, @@ -130,8 +117,6 @@ module cachepool_group_noc_wrapper // -- Struct / xbar field widths (always >= 1 to avoid zero-width ports) ------ localparam int unsigned NocCacheBankBits = $clog2(NrBanks); localparam int unsigned NocAddrTileWidth = (NumTilesPerGroup > 1) ? $clog2(NumTilesPerGroup) : 1; - localparam int unsigned NocAddrXWidth = (NumGroupsX > 1) ? 
$clog2(NumGroupsX) : 1; - localparam int unsigned NocAddrYWidth = (NumGroupsY > 1) ? $clog2(NumGroupsY) : 1; // -- Actual bit counts inside dst_tile_id (can be 0 when that dimension = 1) - // dst_tile_id layout: [ group_y (NocGroupBitsY) | group_x (NocGroupBitsX) | local_tile (NocGroupOffset) ] // where NocGroupOffset = $clog2(NumTilesPerGroup) (0 when NumTilesPerGroup == 1). @@ -167,21 +152,21 @@ module cachepool_group_noc_wrapper logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in_ready; for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mesh_trans_t - for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_mesh_trans_c + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_mesh_trans_n for (genvar d = 0; d < 4; d++) begin : gen_mesh_trans_d - assign noc_req_o[d][t*NumNoCPortsPerTile+c] = req_mesh_out[t][c][d]; - assign noc_req_valid_o[d][t*NumNoCPortsPerTile+c] = req_mesh_out_valid[t][c][d]; - assign req_mesh_out_ready[t][c][d] = noc_req_ready_i[d][t*NumNoCPortsPerTile+c]; - assign req_mesh_in[t][c][d] = noc_req_i[d][t*NumNoCPortsPerTile+c]; - assign req_mesh_in_valid[t][c][d] = noc_req_valid_i[d][t*NumNoCPortsPerTile+c]; - assign noc_req_ready_o[d][t*NumNoCPortsPerTile+c] = req_mesh_in_ready[t][c][d]; - - assign noc_rsp_o[d][t*NumNoCPortsPerTile+c] = rsp_mesh_out[t][c][d]; - assign noc_rsp_valid_o[d][t*NumNoCPortsPerTile+c] = rsp_mesh_out_valid[t][c][d]; - assign rsp_mesh_out_ready[t][c][d] = noc_rsp_ready_i[d][t*NumNoCPortsPerTile+c]; - assign rsp_mesh_in[t][c][d] = noc_rsp_i[d][t*NumNoCPortsPerTile+c]; - assign rsp_mesh_in_valid[t][c][d] = noc_rsp_valid_i[d][t*NumNoCPortsPerTile+c]; - assign noc_rsp_ready_o[d][t*NumNoCPortsPerTile+c] = rsp_mesh_in_ready[t][c][d]; + assign noc_req_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out[t][n][d]; + assign noc_req_valid_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d]; + assign req_mesh_out_ready[t][n][d] = noc_req_ready_i[d][t*NumNoCPortsPerTile+n]; + assign 
req_mesh_in[t][n][d] = noc_req_i[d][t*NumNoCPortsPerTile+n]; + assign req_mesh_in_valid[t][n][d] = noc_req_valid_i[d][t*NumNoCPortsPerTile+n]; + assign noc_req_ready_o[d][t*NumNoCPortsPerTile+n] = req_mesh_in_ready[t][n][d]; + + assign noc_rsp_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out[t][n][d]; + assign noc_rsp_valid_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d]; + assign rsp_mesh_out_ready[t][n][d] = noc_rsp_ready_i[d][t*NumNoCPortsPerTile+n]; + assign rsp_mesh_in[t][n][d] = noc_rsp_i[d][t*NumNoCPortsPerTile+n]; + assign rsp_mesh_in_valid[t][n][d] = noc_rsp_valid_i[d][t*NumNoCPortsPerTile+n]; + assign noc_rsp_ready_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_in_ready[t][n][d]; end end end @@ -190,7 +175,7 @@ module cachepool_group_noc_wrapper if (NumRemoteGroupPortCore > 0) begin : gen_noc // ----------------------------------------------------------------------- - // Router inject/eject signals (flat 1D index j = t*NumNoCPortsPerTile+c) + // Router inject/eject signals (flat 1D index noc_port = t*NumNoCPortsPerTile+n) // ----------------------------------------------------------------------- noc_group_req_t [NumNoCPortsGroup-1:0] packed_req; logic [NumNoCPortsGroup-1:0] packed_req_valid; @@ -253,70 +238,70 @@ module cachepool_group_noc_wrapper remote_group_req_from_group[t*NumRemoteGroupPortTile+p].p_ready; end - for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_mst_eject_c - localparam int unsigned J = t * NumNoCPortsPerTile + c; - assign eject_rsp_payload[c] = eject_rsp[J].payload; - assign mst_xbar_mst_sel[c] = eject_rsp[J].hdr.src_port_id; + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_mst_eject_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + assign eject_rsp_payload[n] = eject_rsp[noc_port].payload; + assign mst_xbar_mst_sel[n] = eject_rsp[noc_port].hdr.src_port_id; end reqrsp_xbar #( - .NumInp ( NumRemoteGroupPortTile ), + .NumInp ( NumRemoteGroupPortTile ), .NumOut ( NumNoCPortsPerTile ), 
.tcdm_req_chan_t ( remote_group_req_chan_t ), .tcdm_rsp_chan_t ( remote_group_rsp_chan_t ) ) i_noc_mst_xbar ( .clk_i, .rst_ni, - .slv_req_i ( mst_slv_req ), - .slv_rr_i ( '0 ), - .slv_req_valid_i ( mst_slv_req_valid ), - .slv_req_ready_o ( mst_slv_req_ready ), - .slv_rsp_o ( mst_slv_rsp ), - .slv_rsp_valid_o ( mst_slv_rsp_valid ), - .slv_rsp_ready_i ( mst_slv_rsp_ready ), - .slv_sel_i ( '0 ), - .slv_selected_o ( mst_xbar_slv_selected ), + .slv_req_i ( mst_slv_req ), + .slv_rr_i ( '0 ), + .slv_req_valid_i ( mst_slv_req_valid ), + .slv_req_ready_o ( mst_slv_req_ready ), + .slv_rsp_o ( mst_slv_rsp ), + .slv_rsp_valid_o ( mst_slv_rsp_valid ), + .slv_rsp_ready_i ( mst_slv_rsp_ready ), + .slv_sel_i ( '0 ), + .slv_selected_o ( mst_xbar_slv_selected ), .mst_req_o ( mst_xbar_req[t*NumNoCPortsPerTile +: - NumNoCPortsPerTile] ), + NumNoCPortsPerTile] ), .mst_req_valid_o ( mst_xbar_req_valid[t*NumNoCPortsPerTile +: - NumNoCPortsPerTile] ), + NumNoCPortsPerTile] ), .mst_req_ready_i ( mst_xbar_req_ready[t*NumNoCPortsPerTile +: - NumNoCPortsPerTile] ), - .mst_rsp_i ( eject_rsp_payload ), - .mst_rr_i ( '0 ), + NumNoCPortsPerTile] ), + .mst_rsp_i ( eject_rsp_payload ), + .mst_rr_i ( '0 ), .mst_rsp_valid_i ( eject_rsp_valid[t*NumNoCPortsPerTile +: - NumNoCPortsPerTile] ), + NumNoCPortsPerTile] ), .mst_rsp_ready_o ( eject_rsp_ready[t*NumNoCPortsPerTile +: - NumNoCPortsPerTile] ), - .mst_sel_i ( mst_xbar_mst_sel ) + NumNoCPortsPerTile] ), + .mst_sel_i ( mst_xbar_mst_sel ) ); - for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_pack_c - localparam int unsigned J = t * NumNoCPortsPerTile + c; - assign packed_req[J].hdr.collective_op = '0; - assign packed_req[J].hdr.src_id = group_xy_id_i; + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_pack_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + assign packed_req[noc_port].hdr.collective_op = '0; + assign packed_req[noc_port].hdr.src_id = group_xy_id_i; // dst_tile_id set by tcdm_cache_interco: bits 
[NocGroupOffset +: NocGroupBitsX] = group_x, // bits [(NocGroupOffset+NocGroupBitsX) +: NocGroupBitsY] = group_y. // When a dimension has only 1 group, no bits are consumed and the coordinate is 0. if (NumGroupsX > 1) begin : gen_dst_x - assign packed_req[J].hdr.dst_id.x = - mst_xbar_req[J].user.dst_tile_id[NocGroupOffset +: NocGroupBitsX]; + assign packed_req[noc_port].hdr.dst_id.x = + mst_xbar_req[noc_port].user.dst_tile_id[NocGroupOffset +: NocGroupBitsX]; end else begin : gen_dst_x - assign packed_req[J].hdr.dst_id.x = '0; + assign packed_req[noc_port].hdr.dst_id.x = '0; end if (NumGroupsY > 1) begin : gen_dst_y - assign packed_req[J].hdr.dst_id.y = - mst_xbar_req[J].user.dst_tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + assign packed_req[noc_port].hdr.dst_id.y = + mst_xbar_req[noc_port].user.dst_tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; end else begin : gen_dst_y - assign packed_req[J].hdr.dst_id.y = '0; + assign packed_req[noc_port].hdr.dst_id.y = '0; end - assign packed_req[J].hdr.dst_id.port_id = '0; - assign packed_req[J].hdr.src_tile_id = group_tile_sel_t'(t); - assign packed_req[J].hdr.src_port_id = mst_xbar_slv_selected[c]; - assign packed_req[J].hdr.last = 1'b1; - assign packed_req[J].payload = mst_xbar_req[J]; - assign packed_req_valid[J] = mst_xbar_req_valid[J]; - assign mst_xbar_req_ready[J] = packed_req_ready[J]; + assign packed_req[noc_port].hdr.dst_id.port_id = '0; + assign packed_req[noc_port].hdr.src_tile_id = group_tile_sel_t'(t); + assign packed_req[noc_port].hdr.src_port_id = mst_xbar_slv_selected[n]; + assign packed_req[noc_port].hdr.last = 1'b1; + assign packed_req[noc_port].payload = mst_xbar_req[noc_port]; + assign packed_req_valid[noc_port] = mst_xbar_req_valid[noc_port]; + assign mst_xbar_req_ready[noc_port] = packed_req_ready[noc_port]; end @@ -327,8 +312,8 @@ module cachepool_group_noc_wrapper // Per-tile per-channel req floo_router // 
----------------------------------------------------------------------- for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_req_router_t - for (genvar c = 0; c < NumNoCPortsPerTile; c++) begin : gen_req_router_c - localparam int unsigned J = t * NumNoCPortsPerTile + c; + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_req_router_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; floo_router #( .NumRoutes ( 5 ), .NumVirtChannels ( 1 ), @@ -345,25 +330,25 @@ module cachepool_group_noc_wrapper ) i_req_router ( .clk_i, .rst_ni, - .test_enable_i ( 1'b0 ), - .xy_id_i ( group_xy_id_i ), - .id_route_map_i ( '0 ), - .valid_i ( {packed_req_valid[J], - req_mesh_in_valid[t][c][3:0]} ), - .ready_o ( {packed_req_ready[J], - req_mesh_in_ready[t][c][3:0]} ), - .data_i ( {packed_req[J], - req_mesh_in[t][c][3:0]} ), - .credit_o ( ), - .valid_o ( {eject_req_valid[J], - req_mesh_out_valid[t][c][3:0]} ), - .ready_i ( {eject_req_ready[J], - req_mesh_out_ready[t][c][3:0]} ), - .data_o ( {eject_req[J], - req_mesh_out[t][c][3:0]} ), - .credit_i ( '1 ), - .offload_req_o ( ), - .offload_rsp_i ( '0 ) + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {packed_req_valid[noc_port], + req_mesh_in_valid[t][n][3:0]} ), + .ready_o ( {packed_req_ready[noc_port], + req_mesh_in_ready[t][n][3:0]} ), + .data_i ( {packed_req[noc_port], + req_mesh_in[t][n][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_req_valid[noc_port], + req_mesh_out_valid[t][n][3:0]} ), + .ready_i ( {eject_req_ready[noc_port], + req_mesh_out_ready[t][n][3:0]} ), + .data_o ( {eject_req[noc_port], + req_mesh_out[t][n][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) ); end end @@ -373,8 +358,8 @@ module cachepool_group_noc_wrapper // Per-tile per-channel rsp floo_router // ----------------------------------------------------------------------- for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_rsp_router_t - for (genvar c = 0; 
c < NumNoCPortsPerTile; c++) begin : gen_rsp_router_c - localparam int unsigned J = t * NumNoCPortsPerTile + c; + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_rsp_router_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; floo_router #( .NumRoutes ( 5 ), .NumVirtChannels ( 1 ), @@ -391,25 +376,25 @@ module cachepool_group_noc_wrapper ) i_rsp_router ( .clk_i, .rst_ni, - .test_enable_i ( 1'b0 ), - .xy_id_i ( group_xy_id_i ), - .id_route_map_i ( '0 ), - .valid_i ( {inject_rsp_valid[J], - rsp_mesh_in_valid[t][c][3:0]} ), - .ready_o ( {inject_rsp_ready[J], - rsp_mesh_in_ready[t][c][3:0]} ), - .data_i ( {inject_rsp[J], - rsp_mesh_in[t][c][3:0]} ), - .credit_o ( ), - .valid_o ( {eject_rsp_valid[J], - rsp_mesh_out_valid[t][c][3:0]} ), - .ready_i ( {eject_rsp_ready[J], - rsp_mesh_out_ready[t][c][3:0]} ), - .data_o ( {eject_rsp[J], - rsp_mesh_out[t][c][3:0]} ), - .credit_i ( '1 ), - .offload_req_o ( ), - .offload_rsp_i ( '0 ) + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {inject_rsp_valid[noc_port], + rsp_mesh_in_valid[t][n][3:0]} ), + .ready_o ( {inject_rsp_ready[noc_port], + rsp_mesh_in_ready[t][n][3:0]} ), + .data_i ( {inject_rsp[noc_port], + rsp_mesh_in[t][n][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_rsp_valid[noc_port], + rsp_mesh_out_valid[t][n][3:0]} ), + .ready_i ( {eject_rsp_ready[noc_port], + rsp_mesh_out_ready[t][n][3:0]} ), + .data_o ( {eject_rsp[noc_port], + rsp_mesh_out[t][n][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) ); end end @@ -418,12 +403,12 @@ module cachepool_group_noc_wrapper // ----------------------------------------------------------------------- // Slave xbar selection signals + inject_rsp ↔ slv_xbar_slv_rsp // ----------------------------------------------------------------------- - for (genvar j = 0; j < NumNoCPortsGroup; j++) begin : gen_slv_sel_j - assign slv_xbar_slv_sel[j] = (NumTilesPerGroup == 1) - ? 
SlvXbarSelW'(eject_req[j].hdr.src_port_id) - : SlvXbarSelW'(eject_req[j].payload.addr[(dynamic_offset_i + NocCacheBankBits) +: NocAddrTileWidth] + for (genvar noc_port = 0; noc_port < NumNoCPortsGroup; noc_port++) begin : gen_slv_sel + assign slv_xbar_slv_sel[noc_port] = (NumTilesPerGroup == 1) + ? SlvXbarSelW'(eject_req[noc_port].hdr.src_port_id) + : SlvXbarSelW'(eject_req[noc_port].payload.addr[(dynamic_offset_i + NocCacheBankBits) +: NocAddrTileWidth] * NumRemoteGroupPortTile - + eject_req[j].hdr.src_port_id); + + eject_req[noc_port].hdr.src_port_id); end @@ -436,30 +421,30 @@ module cachepool_group_noc_wrapper // Slave-side group-wide dispatch xbar // ----------------------------------------------------------------------- reqrsp_xbar #( - .NumInp ( NumNoCPortsGroup ), - .NumOut ( NumRemoteGroupPortGroup ), - .tcdm_req_chan_t ( noc_group_req_t ), - .tcdm_rsp_chan_t ( noc_group_rsp_t ) + .NumInp ( NumNoCPortsGroup ), + .NumOut ( NumRemoteGroupPortGroup), + .tcdm_req_chan_t ( noc_group_req_t ), + .tcdm_rsp_chan_t ( noc_group_rsp_t ) ) i_noc_slv_xbar ( .clk_i, .rst_ni, - .slv_req_i ( eject_req ), - .slv_rr_i ( '0 ), + .slv_req_i ( eject_req ), + .slv_rr_i ( '0 ), .slv_req_valid_i ( eject_req_valid ), .slv_req_ready_o ( eject_req_ready ), - .slv_rsp_o ( slv_xbar_slv_rsp ), + .slv_rsp_o ( slv_xbar_slv_rsp ), .slv_rsp_valid_o ( slv_xbar_slv_rsp_valid ), .slv_rsp_ready_i ( slv_xbar_slv_rsp_ready ), - .slv_sel_i ( slv_xbar_slv_sel ), - .slv_selected_o ( ), - .mst_req_o ( slv_xbar_mst_req ), + .slv_sel_i ( slv_xbar_slv_sel ), + .slv_selected_o ( ), + .mst_req_o ( slv_xbar_mst_req ), .mst_req_valid_o ( slv_xbar_mst_req_valid ), .mst_req_ready_i ( slv_xbar_mst_req_ready ), - .mst_rsp_i ( slv_xbar_mst_rsp ), - .mst_rr_i ( '0 ), + .mst_rsp_i ( slv_xbar_mst_rsp ), + .mst_rr_i ( '0 ), .mst_rsp_valid_i ( slv_xbar_mst_rsp_valid ), .mst_rsp_ready_o ( slv_xbar_mst_rsp_ready ), - .mst_sel_i ( slv_xbar_mst_sel ) + .mst_sel_i ( slv_xbar_mst_sel ) ); @@ -468,49 +453,48 @@ module 
cachepool_group_noc_wrapper // ----------------------------------------------------------------------- for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_slv_deliver_t for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_slv_deliver_p - localparam int unsigned J = t * NumRemoteGroupPortTile + p; - localparam int unsigned SLV = t * NumRemoteGroupPortTile + p; + localparam int unsigned port = t * NumRemoteGroupPortTile + p; // Placeholder response routing: route response back via the NoC channel // of the same tile (t). Correct cross-tile response routing is deferred. - assign slv_xbar_mst_sel[J] = MstXbarSelW'(t * NumNoCPortsPerTile); + assign slv_xbar_mst_sel[port] = MstXbarSelW'(t * NumNoCPortsPerTile); always_comb begin : proc_req_unpack - remote_group_req_to_group[SLV].q = slv_xbar_mst_req[J].payload; - remote_group_req_to_group[SLV].q.user.src_group_x = - slv_xbar_mst_req[J].hdr.src_id.x; - remote_group_req_to_group[SLV].q.user.src_group_y = - slv_xbar_mst_req[J].hdr.src_id.y; + remote_group_req_to_group[port].q = slv_xbar_mst_req[port].payload; + remote_group_req_to_group[port].q.user.src_group_x = + slv_xbar_mst_req[port].hdr.src_id.x; + remote_group_req_to_group[port].q.user.src_group_y = + slv_xbar_mst_req[port].hdr.src_id.y; end - assign remote_group_req_to_group[SLV].q_valid = slv_xbar_mst_req_valid[J]; - assign slv_xbar_mst_req_ready[J] = - remote_group_rsp_from_group[SLV].q_ready; - assign remote_group_req_to_group[SLV].p_ready = slv_xbar_mst_rsp_ready[J]; + assign remote_group_req_to_group[port].q_valid = slv_xbar_mst_req_valid[port]; + assign slv_xbar_mst_req_ready[port] = + remote_group_rsp_from_group[port].q_ready; + assign remote_group_req_to_group[port].p_ready = slv_xbar_mst_rsp_ready[port]; - assign slv_xbar_mst_rsp[J].payload = - remote_group_rsp_from_group[SLV].p; - assign slv_xbar_mst_rsp[J].hdr.collective_op = '0; - assign slv_xbar_mst_rsp[J].hdr.src_id = group_xy_id_i; + assign slv_xbar_mst_rsp[port].payload = + 
remote_group_rsp_from_group[port].p; + assign slv_xbar_mst_rsp[port].hdr.collective_op = '0; + assign slv_xbar_mst_rsp[port].hdr.src_id = group_xy_id_i; if (NumGroupsX > 1) begin : gen_rsp_dst_x - assign slv_xbar_mst_rsp[J].hdr.dst_id.x = - remote_group_rsp_from_group[SLV].p.user.tile_id[NocGroupOffset +: NocGroupBitsX]; + assign slv_xbar_mst_rsp[port].hdr.dst_id.x = + remote_group_rsp_from_group[port].p.user.tile_id[NocGroupOffset +: NocGroupBitsX]; end else begin : gen_rsp_dst_x - assign slv_xbar_mst_rsp[J].hdr.dst_id.x = '0; + assign slv_xbar_mst_rsp[port].hdr.dst_id.x = '0; end if (NumGroupsY > 1) begin : gen_rsp_dst_y - assign slv_xbar_mst_rsp[J].hdr.dst_id.y = - remote_group_rsp_from_group[SLV].p.user.tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + assign slv_xbar_mst_rsp[port].hdr.dst_id.y = + remote_group_rsp_from_group[port].p.user.tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; end else begin : gen_rsp_dst_y - assign slv_xbar_mst_rsp[J].hdr.dst_id.y = '0; + assign slv_xbar_mst_rsp[port].hdr.dst_id.y = '0; end - assign slv_xbar_mst_rsp[J].hdr.dst_id.port_id = '0; - assign slv_xbar_mst_rsp[J].hdr.src_tile_id = group_tile_sel_t'(t); - assign slv_xbar_mst_rsp[J].hdr.src_port_id = remote_group_rsp_from_group[SLV].p.user.port_id; - assign slv_xbar_mst_rsp[J].hdr.last = 1'b1; - assign slv_xbar_mst_rsp_valid[J] = - remote_group_rsp_from_group[SLV].p_valid; + assign slv_xbar_mst_rsp[port].hdr.dst_id.port_id = '0; + assign slv_xbar_mst_rsp[port].hdr.src_tile_id = group_tile_sel_t'(t); + assign slv_xbar_mst_rsp[port].hdr.src_port_id = remote_group_rsp_from_group[port].p.user.port_id; + assign slv_xbar_mst_rsp[port].hdr.last = 1'b1; + assign slv_xbar_mst_rsp_valid[port] = + remote_group_rsp_from_group[port].p_valid; end end diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index c1b5748..01a6de4 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -4,19 +4,11 @@ // Author: Diyou Shen 
-`include "axi/assign.svh" `include "axi/typedef.svh" `include "common_cells/assertions.svh" `include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" -`include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" `include "reqrsp_interface/typedef.svh" `include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" /// Tile implementation for CachePool module cachepool_tile @@ -126,8 +118,6 @@ module cachepool_tile /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. input logic [NrCores-1:0] debug_req_i, - /// End of Computing indicator to notify the host/tb - // output logic eoc_o, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. input logic [NrCores-1:0] meip_i, @@ -206,7 +196,6 @@ module cachepool_tile // --------- // TODO: Should be imported from Memory-mapped Reg logic [2:0] num_private_cache; - // half-half assign num_private_cache = l1d_private_i[2:0]; /// Minimum width to hold the core number. @@ -321,11 +310,6 @@ module cachepool_tile `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, tcdm_user_t) - `MEM_TYPEDEF_ALL(mem, tcdm_mem_addr_t, data_t, strb_t, tcdm_user_t) - - `REG_BUS_TYPEDEF_ALL(reg, addr_t, data_t, strb_t) - - typedef struct packed { int unsigned idx; addr_t start_addr; @@ -562,7 +546,7 @@ module cachepool_tile // Used to determine the mapping policy between different cache banks. // Set through CSR - logic [$clog2(TCDMAddrWidth)-1:0] dynamic_offset; + logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset; assign dynamic_offset = dynamic_offset_i; // One entry per flat remote port: flat index = j + r*NrTCDMPortsPerCore // where j is the xbar index and r is the remote slot within that xbar. 
@@ -813,14 +797,14 @@ module cachepool_tile .dynamic_offset_i ( dynamic_offset ), .private_start_addr_i ( private_start_addr_i ), .num_private_cache_i ( num_private_cache ), - .core_req_i ({xbar_remote_group_in_req, xbar_remote_req_gated, cache_req [j]}), - .core_rsp_ready_i ({xbar_remote_group_in_pready, xbar_remote_in_pready, cache_pready [j]}), - .core_rsp_o ({xbar_remote_group_in_rsp, xbar_remote_rsp_xbar, cache_rsp [j]}), - .tile_sel_o ( xbar_remote_req_dst ), - .remote_group_sel_o ( xbar_remote_group_out_dst ), - .mem_req_o ({xbar_remote_group_out_req, xbar_remote_req_o, cache_xbar_req [j]}), - .mem_rsp_ready_o ({xbar_remote_group_out_pready, xbar_remote_out_pready, cache_xbar_pready[j]}), - .mem_rsp_i ({xbar_remote_group_out_rsp, xbar_remote_rsp_i, cache_xbar_rsp [j]}) + .core_req_i ({xbar_remote_group_in_req, xbar_remote_req_gated, cache_req [j]}), + .core_rsp_ready_i ({xbar_remote_group_in_pready, xbar_remote_in_pready, cache_pready [j]}), + .core_rsp_o ({xbar_remote_group_in_rsp, xbar_remote_rsp_xbar, cache_rsp [j]}), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( xbar_remote_group_out_dst ), + .mem_req_o ({xbar_remote_group_out_req, xbar_remote_req_o, cache_xbar_req [j]}), + .mem_rsp_ready_o ({xbar_remote_group_out_pready, xbar_remote_out_pready, cache_xbar_pready[j]}), + .mem_rsp_i ({xbar_remote_group_out_rsp, xbar_remote_rsp_i, cache_xbar_rsp [j]}) ); end else begin : gen_no_remote_group // No inter-group remote ports: instantiate interco without inter-group remote ports (backward-compatible). 
@@ -837,20 +821,20 @@ module cachepool_tile .tcdm_req_chan_t (tcdm_req_chan_t ), .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) ) i_cache_xbar ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .tile_id_i ( tile_id_i ), - .dynamic_offset_i ( dynamic_offset ), - .private_start_addr_i ( private_start_addr_i ), - .num_private_cache_i ( num_private_cache ), - .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), - .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), - .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), - .tile_sel_o ( xbar_remote_req_dst ), - .remote_group_sel_o ( ), - .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ), - .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ), - .mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} ) + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), + .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), + .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( ), + .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]}), + .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]}), + .mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]}) ); end end @@ -958,6 +942,7 @@ module cachepool_tile localparam NumWordPerLine = L1LineWidth / DataWidth; localparam int unsigned WordBytes = DataWidth / 8; +`ifndef TARGET_SYNTHESIS initial begin $display("Cache Configuration:"); $display(" NumCtrl : %0d", NumL1CtrlTile); @@ -972,6 +957,7 @@ module cachepool_tile $display(" RefillDataWidth: %0d", RefillDataWidth); $display(" DynamicOffset : %0d", dynamic_offset); end +`endif // CL-offset mask: bits below dynamic_offset, verbatim in both directions. 
logic [SpatzAxiAddrWidth-1:0] bitmask_lo; @@ -1269,7 +1255,7 @@ module cachepool_tile .clk_i (clk_i ), .rst_ni (rst_ni ), .impl_i ('0 ), - .impl_o (/* unsed */ ), + .impl_o (/* unused */ ), .req_i (l1_tag_bank_req [cb][j]), .we_i (l1_tag_bank_we [cb][j]), .addr_i (l1_tag_bank_addr [cb][j]), @@ -1300,7 +1286,7 @@ module cachepool_tile .clk_i (clk_i ), .rst_ni (rst_ni ), .impl_i ('0 ), - .impl_o (/* unsed */ ), + .impl_o (/* unused */ ), .req_i ( l1_data_bank_req [cb][BaseIdx] ), .we_i ( l1_data_bank_we [cb][BaseIdx] ), .addr_i ( l1_data_bank_addr [cb][BaseIdx] ), @@ -1324,7 +1310,7 @@ module cachepool_tile // .clk_i (clk_i ), // .rst_ni (rst_ni ), // .impl_i ('0 ), - // .impl_o (/* unsed */ ), + // .impl_o (/* unused */ ), // .req_i (l1_data_bank_req [cb][j]), // .we_i (l1_data_bank_we [cb][j]), // .addr_i (l1_data_bank_addr [cb][j]), From e7edb5017541f1cc8c225e4ecf95933f57499193 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 15 May 2026 12:08:34 +0200 Subject: [PATCH 18/37] [SRC] Clean code. --- hardware/src/cachepool_cluster.sv | 16 +++++++------- hardware/src/cachepool_group.sv | 24 ++++++++++----------- hardware/src/cachepool_group_noc_wrapper.sv | 8 +++---- hardware/src/cachepool_tile.sv | 16 +++++++------- util/auto-benchmark/configs-ci.sh | 2 +- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index 70523a6..90cb169 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -105,20 +105,20 @@ module cachepool_cluster input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic debug_req_i, /// End of Computing indicator to notify the host/tb output logic [3:0] eoc_o, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. 
This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. @@ -274,10 +274,10 @@ module cachepool_cluster .rst_ni ( rst_ni ), .impl_i ( impl_i ), .error_o ( group_error[g] ), - .debug_req_i ( debug_req_i[g*NumCoreGroup +: NumCoreGroup] ), - .meip_i ( meip_i [g*NumCoreGroup +: NumCoreGroup] ), - .mtip_i ( mtip_i [g*NumCoreGroup +: NumCoreGroup] ), - .msip_i ( msip_i [g*NumCoreGroup +: NumCoreGroup] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), .hart_base_id_i ( hart_base_id_i + 10'(g * NumCoreGroup) ), .tile_base_id_i ( TileIDWidth'(g * NumTilesPerGroup) ), .cluster_base_addr_i ( cluster_base_addr_i ), diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index 734617d..1637bff 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -110,18 +110,18 @@ module cachepool_group input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. 
- input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. @@ -708,10 +708,10 @@ module cachepool_group .rst_ni ( rst_ni ), .impl_i ( impl_i ), .error_o ( error [t] ), - .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), - .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), - .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), - .msip_i ( msip_i [t*NumCoresTile+:NumCoresTile] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), .hart_base_id_i ( hart_base_id ), .cluster_base_addr_i ( cluster_base_addr_i ), .tile_id_i ( tile_id ), @@ -797,10 +797,10 @@ module cachepool_group .rst_ni ( rst_ni ), .impl_i ( impl_i ), .error_o ( error [t] ), - .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), - .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), - .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), - .msip_i ( msip_i [t*NumCoresTile+:NumCoresTile] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), .hart_base_id_i ( hart_base_id ), .cluster_base_addr_i ( cluster_base_addr_i ), .tile_id_i ( tile_id ), diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv index 9e14914..ce8d6c3 100644 --- a/hardware/src/cachepool_group_noc_wrapper.sv +++ 
b/hardware/src/cachepool_group_noc_wrapper.sv @@ -61,10 +61,10 @@ module cachepool_group_noc_wrapper ) ( input logic clk_i, input logic rst_ni, - input logic [NrCores-1:0] debug_req_i, - input logic [NrCores-1:0] meip_i, - input logic [NrCores-1:0] mtip_i, - input logic [NrCores-1:0] msip_i, + input logic debug_req_i, + input logic meip_i, + input logic mtip_i, + input logic msip_i, input logic [9:0] hart_base_id_i, input logic [TileIDWidth-1:0] tile_base_id_i, input axi_addr_t cluster_base_addr_i, diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 01a6de4..967b057 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -117,18 +117,18 @@ module cachepool_tile input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. 
@@ -1333,13 +1333,13 @@ module cachepool_tile interrupts_t irq; sync #(.STAGES (2)) - i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i[i]), .serial_o (irq.debug)); + i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i), .serial_o (irq.debug)); sync #(.STAGES (2)) - i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i[i]), .serial_o (irq.meip)); + i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i), .serial_o (irq.meip)); sync #(.STAGES (2)) - i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i[i]), .serial_o (irq.mtip)); + i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i), .serial_o (irq.mtip)); sync #(.STAGES (2)) - i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i[i]), .serial_o (irq.msip)); + i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i), .serial_o (irq.msip)); assign irq.mcip = cl_interrupt_i[i]; tcdm_req_t [TcdmPorts-1:0] tcdm_req_wo_user; diff --git a/util/auto-benchmark/configs-ci.sh b/util/auto-benchmark/configs-ci.sh index 1801682..70557e2 100644 --- a/util/auto-benchmark/configs-ci.sh +++ b/util/auto-benchmark/configs-ci.sh @@ -1,5 +1,5 @@ # Configs and kernel suffixes (without prefix) CONFIGS="cachepool_fpu_2g" -KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" +KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv_M512_N128_K32 fmatmul-32b_M64_N64_K64 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" PREFIX="test-cachepool-" # common prefix for all kernels ROOT_PATH=../.. 
# adjust if needed (path to repo root) From a1dd55e73b5bda1f6b40ceb1f7b0fee8abb1be44 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 15 May 2026 15:23:08 +0200 Subject: [PATCH 19/37] [SW] Fix wrong checks in matmul kernel --- software/tests/fmatmul-32b/main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/software/tests/fmatmul-32b/main.c b/software/tests/fmatmul-32b/main.c index 550cbb2..a36c01f 100644 --- a/software/tests/fmatmul-32b/main.c +++ b/software/tests/fmatmul-32b/main.c @@ -53,7 +53,7 @@ int verify_matrix(float *matrix, const float *checksum, // printf("Row: %d, Result: %x, Golden reselt: %x\n", i, print_sum, print_gold); } } - return -1; + return error; } int main() { @@ -78,6 +78,7 @@ int main() { // All cores will access the same B // Scramble based on cacheline l1d_xbar_config(5); + l1d_part(4); } // Wait for all cores to finish @@ -161,7 +162,11 @@ int main() { snrt_cluster_hw_barrier(); if (cid == 0) { - for (uint32_t j = 0; j < num_cores; j++) { + if (error[0] != 0) + printf("Core 0 error %d\n", error[0]); + + for (uint32_t j = 1; j < num_cores; j++) { + error[0] += error[j]; if (error[j] != 0) printf("Core %d error %d\n", j, error[j]); } @@ -196,6 +201,8 @@ int main() { // Wait for all cores to finish snrt_cluster_hw_barrier(); + if (error[0] > 0) + return -1; return 0; } From 617afa56dfe35bfd79b086db005c173b30f90aa9 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 19 May 2026 14:42:12 +0200 Subject: [PATCH 20/37] [SRC] Reduce offchip AXI ID width. 
--- config/cachepool_fpu_16g.mk | 100 +++++++++ config/cachepool_fpu_2g.mk | 2 +- config/cachepool_fpu_4g.mk | 2 +- hardware/bootrom/bootdata.cc | 4 +- hardware/bootrom/bootdata_bootrom.cc | 4 +- hardware/bootrom/bootrom.bin | Bin 136 -> 136 bytes hardware/bootrom/bootrom.dump | 4 +- hardware/bootrom/bootrom.elf | Bin 5248 -> 5248 bytes hardware/bootrom/bootrom.sv | 4 +- .../cachepool_peripheral.sv | 2 +- hardware/src/cachepool_cluster.sv | 141 ++++++++++--- hardware/src/cachepool_pkg.sv | 71 +++++-- hardware/src/cachepool_tile.sv | 3 +- hardware/tb/cachepool_cluster_wrapper.sv | 193 +++++++++++++----- hardware/tb/tb_cachepool.sv | 63 +++--- 15 files changed, 452 insertions(+), 141 deletions(-) create mode 100644 config/cachepool_fpu_16g.mk diff --git a/config/cachepool_fpu_16g.mk b/config/cachepool_fpu_16g.mk new file mode 100644 index 0000000..125a74d --- /dev/null +++ b/config/cachepool_fpu_16g.mk @@ -0,0 +1,100 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Diyou Shen, ETH Zurich + +######################### +## CachePool Cluster ## +######################### + +# Number of groups +num_groups ?= 16 + +# Number of tiles +num_tiles_per_group ?= 4 + +# Number of cores +num_cores_per_tile ?= 4 + +# Core datawidth +data_width ?= 32 + +# Core addrwidth +addr_width ?= 32 + +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + + +###################### +## CachePool Tile ## +###################### + +# Refill interconnection data width +refill_data_width ?= 128 + +##### L1 Data Cache ##### + +# L1 data cacheline width (in Bit) +l1d_cacheline_width ?= 512 + +# L1 data cache size (in KiB) +l1d_size ?= 256 + +# L1 data cache banking factor (how many banks per core?) 
+l1d_bank_factor ?= 1 + +# L1 coalescing window +l1d_coal_window ?= 2 + +# L1 data cache number of ways per +l1d_num_way ?= 4 + +# L1 data cache size per tile (KiB) +l1d_tile_size ?= 256 + +# L1 data cache tag width (TODO: should be calculated) +l1d_tag_data_width ?= 92 + +#################### +## CachePool CC ## +#################### +# Spatz fpu support? +spatz_fpu_en ?= 1 + +# Spatz number of FPU +spatz_num_fpu ?= 4 + +# Spatz number of IPU +spatz_num_ipu ?= 4 + +# Spatz max outstanding transactions +spatz_max_trans ?= 32 + +# Snitch/FPU max outstanding transactions +snitch_max_trans ?= 16 + + +##################### +## L2 Main Memory ## +##################### +# L2 number of channels +l2_channel ?= 16 + +# L2 bank width (DRAM width, change with care) +l2_bank_width ?= 512 + +# L2 interleaving factor (in order of bank_width) +l2_interleave ?= 16 + + +################## +## Peripherals ## +################## +# Hardware stack size (in Byte) +stack_hw_size ?= 1024 + +# Stack size (total, including share and private, 32'h800) +stack_tot_size ?= 2048 diff --git a/config/cachepool_fpu_2g.mk b/config/cachepool_fpu_2g.mk index 89a0815..df88162 100644 --- a/config/cachepool_fpu_2g.mk +++ b/config/cachepool_fpu_2g.mk @@ -81,7 +81,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 8 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/config/cachepool_fpu_4g.mk b/config/cachepool_fpu_4g.mk index a9a5458..3d60410 100644 --- a/config/cachepool_fpu_4g.mk +++ b/config/cachepool_fpu_4g.mk @@ -81,7 +81,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 16 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/hardware/bootrom/bootdata.cc b/hardware/bootrom/bootdata.cc index 7955d62..55a40ec 100644 --- a/hardware/bootrom/bootdata.cc +++ b/hardware/bootrom/bootdata.cc @@ -7,13
+7,13 @@ namespace sim { const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 32, + .core_count = 256, .hartid_base = 0, .tcdm_start = 0xbffff800, .tcdm_size = 0x800, .tcdm_offset = 0x0, .global_mem_start = 0x80000000, .global_mem_end = 0xa0000000, - .tile_count = 8}; + .tile_count = 64}; } // namespace sim diff --git a/hardware/bootrom/bootdata_bootrom.cc b/hardware/bootrom/bootdata_bootrom.cc index 950bc0a..db006ef 100644 --- a/hardware/bootrom/bootdata_bootrom.cc +++ b/hardware/bootrom/bootdata_bootrom.cc @@ -18,11 +18,11 @@ struct BootData { }; extern "C" const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 32, + .core_count = 256, .hartid_base = 0, .tcdm_start = 0xbffff800, .tcdm_size = 0x800, .tcdm_offset = 0x0, .global_mem_start = 0x80000000, .global_mem_end = 0xa0000000, - .tile_count = 8}; + .tile_count = 64}; diff --git a/hardware/bootrom/bootrom.bin b/hardware/bootrom/bootrom.bin index 24326b49d008c2015b19d9529061dd4d0593660c..aee11d2c064f36b54f18744ef583a63c09c1d001 100755 GIT binary patch delta 17 YcmeBR>|mVW!o|mVW!lb}3(M^ewV`8Q}04EFtBme*a diff --git a/hardware/bootrom/bootrom.dump b/hardware/bootrom/bootrom.dump index 6ed2b86..9e3cf2b 100644 --- a/hardware/bootrom/bootrom.dump +++ b/hardware/bootrom/bootrom.dump @@ -29,7 +29,7 @@ Disassembly of section .rodata: 00001040 : 1040: 1000 .2byte 0x1000 1042: 0000 .2byte 0x0 - 1044: 0020 .2byte 0x20 + 1044: 0100 .2byte 0x100 1046: 0000 .2byte 0x0 1048: 0000 .2byte 0x0 104a: 0000 .2byte 0x0 @@ -44,7 +44,7 @@ Disassembly of section .rodata: 1062: a000 .2byte 0xa000 1064: 0000 .2byte 0x0 1066: 0000 .2byte 0x0 - 1068: 0008 .2byte 0x8 + 1068: 0040 .2byte 0x40 106a: 0000 .2byte 0x0 106c: 0000 .2byte 0x0 ... 
diff --git a/hardware/bootrom/bootrom.elf b/hardware/bootrom/bootrom.elf index cfb4fa808e615843c403329f452281211e6a6e60..83b7f0bb367b6c3adf3549dd4ed0adcec21f2f9e 100755 GIT binary patch delta 33 ocmZqBY|z}`BEZDJIN41=iP2$mrobXbHp8r<@Y1r)f 1) begin : gen_l2_group_mux for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_ch_mux - // Collect per-group inputs for this channel. - axi_premux_cache_req_t [NumGroups-1:0] l2_mux_slv_req; - axi_premux_cache_resp_t [NumGroups-1:0] l2_mux_slv_rsp; + // Per-group ID remapper: reduces GroupAxiIdOutWidth to WideRefillIdWidth before the mux. + // axi_id_remap preserves ID independence (unlike axi_id_serialize) for performance. + // AxiSlvPortMaxUniqIds = NumSpatzOutstandingLoads*2 matches the reqrsp_to_axi MaxTrans + // so the remapper never stalls. + for (genvar g = 0; g < NumGroups; g++) begin : gen_l2_mux_remap + axi_id_remap #( + .AxiSlvPortIdWidth ( GroupAxiIdOutWidth ), + .AxiSlvPortMaxUniqIds ( NumSpatzOutstandingLoads * 2 ), + .AxiMaxTxnsPerId ( NumSpatzOutstandingLoads ), + .AxiMstPortIdWidth ( WideIdWidthPreMux ), + .slv_req_t ( axi_premux_cache_req_t ), + .slv_resp_t ( axi_premux_cache_resp_t ), + .mst_req_t ( axi_remap_cache_req_t ), + .mst_resp_t ( axi_remap_cache_resp_t ) + ) i_l2_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( wide_axi_premux_req[g][ch] ), + .slv_resp_o ( wide_axi_premux_rsp[g][ch] ), + .mst_req_o ( wide_axi_remap_req[g][ch] ), + .mst_resp_i ( wide_axi_remap_rsp[g][ch] ) + ); + end + + // Collect remapped per-group inputs for the mux. 
+ axi_remap_cache_req_t [NumGroups-1:0] l2_mux_slv_req; + axi_remap_cache_resp_t [NumGroups-1:0] l2_mux_slv_rsp; for (genvar g = 0; g < NumGroups; g++) begin : gen_l2_mux_connect - assign l2_mux_slv_req[g] = wide_axi_premux_req[g][ch]; - assign wide_axi_premux_rsp[g][ch] = l2_mux_slv_rsp[g]; + assign l2_mux_slv_req[g] = wide_axi_remap_req[g][ch]; + assign wide_axi_remap_rsp[g][ch] = l2_mux_slv_rsp[g]; end axi_mux #( .SlvAxiIDWidth ( WideIdWidthPreMux ), - .slv_aw_chan_t ( axi_premux_cache_aw_chan_t ), + .slv_aw_chan_t ( axi_remap_cache_aw_chan_t ), .mst_aw_chan_t ( axi_slv_cache_aw_chan_t ), .w_chan_t ( axi_slv_cache_w_chan_t ), - .slv_b_chan_t ( axi_premux_cache_b_chan_t ), + .slv_b_chan_t ( axi_remap_cache_b_chan_t ), .mst_b_chan_t ( axi_slv_cache_b_chan_t ), - .slv_ar_chan_t ( axi_premux_cache_ar_chan_t ), + .slv_ar_chan_t ( axi_remap_cache_ar_chan_t ), .mst_ar_chan_t ( axi_slv_cache_ar_chan_t ), - .slv_r_chan_t ( axi_premux_cache_r_chan_t ), + .slv_r_chan_t ( axi_remap_cache_r_chan_t ), .mst_r_chan_t ( axi_slv_cache_r_chan_t ), - .slv_req_t ( axi_premux_cache_req_t ), - .slv_resp_t ( axi_premux_cache_resp_t ), + .slv_req_t ( axi_remap_cache_req_t ), + .slv_resp_t ( axi_remap_cache_resp_t ), .mst_req_t ( axi_slv_cache_req_t ), .mst_resp_t ( axi_slv_cache_resp_t ), .NoSlvPorts ( NumGroups ), @@ -569,6 +601,11 @@ module cachepool_cluster axi_narrow_req_t [NumTiles-1:0] axi_core_csr_req, axi_barrier_req; axi_narrow_resp_t [NumTiles-1:0] axi_core_csr_rsp, axi_barrier_rsp; + // Serialized CSR signals: one entry per tile plus one for the external axi_in port. + // Index [NumTiles] = axi_in_req_i, indices [NumTiles-1:0] = per-tile CSR outputs. 
+ axi_csr_ser_req_t [NumTiles:0] axi_csr_pre_mux_req; + axi_csr_ser_resp_t [NumTiles:0] axi_csr_pre_mux_rsp; + for (genvar t = 0; t < NumTiles; t++) begin assign axi_barrier_req[t] = axi_out_req [t][ClusterPeriph]; @@ -604,20 +641,70 @@ module cachepool_cluster .cluster_periph_start_address_i ( tcdm_end_address ) ); + // Per-tile CSR ID serializers: reduce CsrAxiMstIdWidth to CsrSerIdWidth before the mux + // so the mux output stays bounded regardless of NumTiles. + for (genvar t = 0; t < NumTiles; t++) begin : gen_csr_id_serialize + axi_id_serialize #( + .AxiSlvPortIdWidth ( CsrAxiMstIdWidth ), + .AxiSlvPortMaxTxns ( 2 ), + .AxiMstPortIdWidth ( CsrSerIdWidth ), + .AxiMstPortMaxUniqIds ( 1 ), + .AxiMstPortMaxTxnsPerId ( 2 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( SpatzAxiNarrowDataWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .AtopSupport ( 1'b0 ), + .slv_req_t ( axi_narrow_req_t ), + .slv_resp_t ( axi_narrow_resp_t ), + .mst_req_t ( axi_csr_ser_req_t ), + .mst_resp_t ( axi_csr_ser_resp_t ) + ) i_csr_id_serialize ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_core_csr_req[t] ), + .slv_resp_o ( axi_core_csr_rsp[t] ), + .mst_req_o ( axi_csr_pre_mux_req[t] ), + .mst_resp_i ( axi_csr_pre_mux_rsp[t] ) + ); + end + + // Serializer for the external axi_in port (SoC CSR access). 
+ axi_id_serialize #( + .AxiSlvPortIdWidth ( AxiIdWidthIn ), + .AxiSlvPortMaxTxns ( 2 ), + .AxiMstPortIdWidth ( CsrSerIdWidth ), + .AxiMstPortMaxUniqIds ( 1 ), + .AxiMstPortMaxTxnsPerId ( 2 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( SpatzAxiNarrowDataWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .AtopSupport ( 1'b0 ), + .slv_req_t ( axi_in_req_t ), + .slv_resp_t ( axi_in_resp_t ), + .mst_req_t ( axi_csr_ser_req_t ), + .mst_resp_t ( axi_csr_ser_resp_t ) + ) i_csr_in_id_serialize ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_in_req_i ), + .slv_resp_o ( axi_in_resp_o ), + .mst_req_o ( axi_csr_pre_mux_req[NumTiles] ), + .mst_resp_i ( axi_csr_pre_mux_rsp[NumTiles] ) + ); axi_mux #( - .SlvAxiIDWidth ( CsrAxiMstIdWidth ), - .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), + .SlvAxiIDWidth ( CsrSerIdWidth ), + .slv_aw_chan_t ( axi_csr_ser_aw_chan_t ), .mst_aw_chan_t ( axi_csr_slv_aw_chan_t ), .w_chan_t ( axi_csr_slv_w_chan_t ), - .slv_b_chan_t ( axi_csr_mst_b_chan_t ), + .slv_b_chan_t ( axi_csr_ser_b_chan_t ), .mst_b_chan_t ( axi_csr_slv_b_chan_t ), - .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), + .slv_ar_chan_t ( axi_csr_ser_ar_chan_t ), .mst_ar_chan_t ( axi_csr_slv_ar_chan_t ), - .slv_r_chan_t ( axi_csr_mst_r_chan_t ), + .slv_r_chan_t ( axi_csr_ser_r_chan_t ), .mst_r_chan_t ( axi_csr_slv_r_chan_t ), - .slv_req_t ( axi_csr_mst_req_t ), - .slv_resp_t ( axi_csr_mst_resp_t ), + .slv_req_t ( axi_csr_ser_req_t ), + .slv_resp_t ( axi_csr_ser_resp_t ), .mst_req_t ( axi_csr_slv_req_t ), .mst_resp_t ( axi_csr_slv_resp_t ), .NoSlvPorts ( NumTiles + 1 ), @@ -632,8 +719,8 @@ module cachepool_cluster .clk_i ( clk_i ), .rst_ni ( rst_ni ), .test_i ('0 ), - .slv_reqs_i ( {axi_in_req_i, axi_core_csr_req} ), - .slv_resps_o ( {axi_in_resp_o, axi_core_csr_rsp} ), + .slv_reqs_i ( axi_csr_pre_mux_req ), + .slv_resps_o ( axi_csr_pre_mux_rsp ), .mst_req_o ( axi_csr_req ), .mst_resp_i ( axi_csr_rsp ) ); diff --git a/hardware/src/cachepool_pkg.sv 
b/hardware/src/cachepool_pkg.sv index d26d7d5..6dd9af7 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -192,37 +192,62 @@ package cachepool_pkg; localparam int unsigned ClusterRouteIdWidth = $clog2(NumClusterMst); /***** ID Width Topology (Tile -> Group -> Cluster) *****/ + // TileAxiIdWidth: base iCache/DMA AXI ID bits per tile before tile-index bits are added. + // Determines how many outstanding refills the iCache can track (2^TileAxiIdWidth = 8). + // This is the "tile_local_bits" field described above. localparam int unsigned TileAxiIdWidth = 3; localparam int unsigned GroupAxiIdWidth = TileAxiIdWidth + $clog2(NumTiles); localparam int unsigned ClusterAxiIdWidth = GroupAxiIdWidth + ClusterRouteIdWidth; - - // legacy naming + // Alias used by the Spatz-generated wrapper and testbench templates. localparam int unsigned SpatzAxiIdInWidth = ClusterAxiIdWidth; - // localparam int unsigned SpatzAxiIdInWidth = TileAxiIdWidth; + // Per-group AXI output ID width (pre multi-group mux). + // The +1 comes from reqrsp_to_axi, which tags each burst with one extra bit. localparam int unsigned GroupAxiIdOutWidth = ClusterAxiIdWidth + 1; + // Bounded per-group refill ID width: uses NumTilesPerGroup (not NumTiles) so the + // ID space stays fixed regardless of total system size. axi_id_remap at each group + // output reduces GroupAxiIdOutWidth to this before the inter-group mux / future NoC. + // For NumGroups == 1, NumTilesPerGroup == NumTiles so this equals GroupAxiIdOutWidth. + localparam int unsigned WideRefillIdWidth = TileAxiIdWidth + $clog2(NumTilesPerGroup) + ClusterRouteIdWidth + 1; // Cluster-level AXI output ID width: widened by multi-group mux. - // When NumGroups == 1, $clog2(1) == 0 so this equals GroupAxiIdOutWidth. + // When NumGroups == 1, $clog2(1) == 0 so this equals WideRefillIdWidth == GroupAxiIdOutWidth. localparam int unsigned GroupMuxIdBits = (NumGroups > 1) ? 
$clog2(NumGroups) : 0; - localparam int unsigned SpatzAxiIdOutWidth = GroupAxiIdOutWidth + GroupMuxIdBits; + localparam int unsigned SpatzAxiIdOutWidth = WideRefillIdWidth + GroupMuxIdBits; // Fixed AXI ID width for IWC localparam int unsigned IwcAxiIdOutWidth = SpatzAxiIdOutWidth + 1; - localparam int unsigned CsrAxiMstIdWidth = ClusterAxiIdWidth; - localparam int unsigned CsrAxiSlvIdWidth = ClusterAxiIdWidth + $clog2(NumTiles+1); + // Cluster wrapper external output AXI ID width, after the wrapper-level axi_id_remap. + // Reduces the fat SpatzAxiIdOutWidth presented to the DRAM controller. + // Must satisfy: WrapperAxiIdOutWidth >= $clog2(NumAxiMaxTrans) = $clog2(32) = 5. + localparam int unsigned WrapperAxiIdOutWidth = 6; + // External SoC/testbench input AXI ID width (host → cluster direction). + // axi_id_remap in the wrapper expands these to SpatzAxiIdInWidth internally. + localparam int unsigned WrapperAxiIdInWidth = 4; + // External narrow output AXI ID width for the UART port (cluster → SoC direction). + // axi_id_remap in the wrapper compresses SpatzAxiUartIdWidth to this. + localparam int unsigned WrapperAxiNarrowIdOutWidth = 4; - // Base ID width 6, plus tile mux => adding clog(tile) - localparam int unsigned SpatzAxiNarrowIdWidth = 6 + $clog2(NumTiles); - // UART ID width, with an extra xbar + localparam int unsigned CsrAxiMstIdWidth = ClusterAxiIdWidth; + // ID width after per-master serialization before the CSR mux. + // axi_id_serialize at each CSR master reduces CsrAxiMstIdWidth to this, + // keeping the mux output (CsrAxiSlvIdWidth) bounded regardless of NumTiles. + // Must be > 1: axi_id_serialize internally uses axi_id_prepend which requires + // AxiMstPortIdWidth > MuxIdWidth (= 1 when AxiMstPortMaxUniqIds = 1). + localparam int unsigned CsrSerIdWidth = 2; + localparam int unsigned CsrAxiSlvIdWidth = CsrSerIdWidth + $clog2(NumTiles+1); + + // Narrow AXI ID width = ClusterAxiIdWidth (same field structure, used on the narrow path). 
+ localparam int unsigned SpatzAxiNarrowIdWidth = ClusterAxiIdWidth; + // UART ID width: narrow path muxed across all tiles adds $clog2(NumTiles) bits. localparam int unsigned SpatzAxiUartIdWidth = SpatzAxiNarrowIdWidth + $clog2(NumTiles); - // BootROM AXI ID width: wide data bus, muxed from NumTiles tile ports. + // BootROM AXI ID width: wide data bus, muxed from NumTilesPerGroup tile ports per group. // The group's axi_mst_cache slave ID width = GroupAxiIdWidth + 1 - // (cluster passes WideIdWidthIn = SpatzAxiIdOutWidth - clog2(NumClusterMst) - // = ClusterAxiIdWidth + 1 - ClusterRouteIdWidth = GroupAxiIdWidth + 1). - // The mux master adds $clog2(NumTiles) bits on top. - localparam int unsigned BootRomAxiSlvIdWidth = GroupAxiIdWidth + 1 + $clog2(NumTiles); + // (cluster passes WideIdWidthIn = SpatzAxiIdOutWidth - ClusterRouteIdWidth - GroupMuxIdBits + // = GroupAxiIdWidth + 1). + // The per-group BootROM mux master adds $clog2(NumTilesPerGroup) bits on top. + localparam int unsigned BootRomAxiSlvIdWidth = GroupAxiIdWidth + 1 + $clog2(NumTilesPerGroup); /***** Tile Ports *****/ // We have three sets of AXI ports for each tile: @@ -315,9 +340,13 @@ package cachepool_pkg; typedef logic [SpatzAxiUartIdWidth-1:0] axi_uart_id_t; typedef logic [CsrAxiMstIdWidth-1:0] axi_id_csr_mst_t; + typedef logic [CsrSerIdWidth-1:0] axi_id_csr_ser_t; typedef logic [CsrAxiSlvIdWidth-1:0] axi_id_csr_slv_t; - typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + typedef logic [WrapperAxiIdOutWidth-1:0] axi_id_wrapper_out_t; + typedef logic [WrapperAxiIdInWidth-1:0] axi_id_wrapper_in_t; + typedef logic [WrapperAxiNarrowIdOutWidth-1:0] axi_id_wrapper_narrow_out_t; typedef logic [BootRomAxiSlvIdWidth-1:0] axi_bootrom_slv_id_t; @@ -507,10 +536,18 @@ package cachepool_pkg; `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) // Per-group AXI output: narrower ID (pre multi-group 
mux). `AXI_TYPEDEF_ALL(spatz_axi_group_out, axi_addr_t, axi_id_group_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Wrapper-level external output type: ID narrowed from SpatzAxiIdOutWidth to WrapperAxiIdOutWidth. + `AXI_TYPEDEF_ALL(spatz_axi_wrapper_out, axi_addr_t, axi_id_wrapper_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Wrapper-level external input type: narrow ID from SoC (WrapperAxiIdInWidth → SpatzAxiIdInWidth inside). + `AXI_TYPEDEF_ALL(spatz_axi_wrapper_in, axi_addr_t, axi_id_wrapper_in_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // Wrapper-level external narrow output type: ID compressed from SpatzAxiUartIdWidth to WrapperAxiNarrowIdOutWidth. + `AXI_TYPEDEF_ALL(spatz_axi_wrapper_narrow_out, axi_addr_t, axi_id_wrapper_narrow_out_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(axi_csr_mst, axi_addr_t, axi_id_csr_mst_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // Serialized CSR type: CsrSerIdWidth-bit (2-bit) ID output of axi_id_serialize, fed into the CSR mux slave ports.
+ `AXI_TYPEDEF_ALL(axi_csr_ser, axi_addr_t, axi_id_csr_ser_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(axi_csr_slv, axi_addr_t, axi_id_csr_slv_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) // BootROM: wide data bus (same payload as cache), slv = post-mux (widened ID) `AXI_TYPEDEF_ALL(axi_bootrom_slv, axi_addr_t, axi_bootrom_slv_id_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 967b057..144bfa2 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -1084,6 +1084,7 @@ module cachepool_tile .CacheLineWidth (L1LineWidth ), .SetAssociativity (L1AssoPerCtrl ), .BankFactor (L1BankFactor ), + .LogDebug (0 ), .RefillDataWidth (RefillDataWidth ), // Type .core_meta_t (tcdm_user_t ), @@ -1359,7 +1360,7 @@ module cachepool_tile .DataWidth (NarrowDataWidth ), .UserWidth (AxiUserWidth ), .DMADataWidth (AxiDataWidth ), - .DMAIdWidth (AxiIdWidthIn ), + .DMAIdWidth (WideIdWidthIn ), .SnitchPMACfg (SnitchPMACfg ), .DMAAxiReqFifoDepth (DMAAxiReqFifoDepth ), .DMAReqFifoDepth (DMAReqFifoDepth ), diff --git a/hardware/tb/cachepool_cluster_wrapper.sv b/hardware/tb/cachepool_cluster_wrapper.sv index a9dba20..cfac974 100644 --- a/hardware/tb/cachepool_cluster_wrapper.sv +++ b/hardware/tb/cachepool_cluster_wrapper.sv @@ -13,50 +13,65 @@ module cachepool_cluster_wrapper import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; #( - parameter int unsigned AxiAddrWidth = SpatzAxiAddrWidth, - parameter int unsigned AxiDataWidth = SpatzAxiDataWidth, - parameter int unsigned AxiUserWidth = SpatzAxiUserWidth, - parameter int unsigned AxiInIdWidth = SpatzAxiIdInWidth, - parameter int unsigned AxiOutIdWidth = SpatzAxiIdOutWidth, - - parameter type axi_in_resp_t = spatz_axi_in_resp_t, - parameter type axi_in_req_t = spatz_axi_in_req_t, - - parameter type axi_out_resp_t = spatz_axi_out_resp_t, - parameter type axi_out_req_t = 
spatz_axi_out_req_t, - - parameter type axi_narrow_req_t = spatz_axi_narrow_req_t, - parameter type axi_narrow_resp_t = spatz_axi_narrow_resp_t + parameter int unsigned AxiAddrWidth = SpatzAxiAddrWidth, + parameter int unsigned AxiDataWidth = SpatzAxiDataWidth, + parameter int unsigned AxiUserWidth = SpatzAxiUserWidth, + // External input ID width (SoC/testbench → wrapper); remapped to SpatzAxiIdInWidth inside. + parameter int unsigned AxiInIdWidth = WrapperAxiIdInWidth, + // External wide output ID width (wrapper → DRAM); remapped from SpatzAxiIdOutWidth inside. + parameter int unsigned AxiOutIdWidth = WrapperAxiIdOutWidth, + // External narrow output ID width (UART, wrapper → SoC); remapped from SpatzAxiUartIdWidth inside. + parameter int unsigned AxiNarrowOutIdWidth = WrapperAxiNarrowIdOutWidth, + + // External input types use the wrapper-narrowed ID (WrapperAxiIdInWidth). + parameter type axi_in_req_t = spatz_axi_wrapper_in_req_t, + parameter type axi_in_resp_t = spatz_axi_wrapper_in_resp_t, + + // External wide output types use the wrapper-narrowed ID (WrapperAxiIdOutWidth). + parameter type axi_out_req_t = spatz_axi_wrapper_out_req_t, + parameter type axi_out_resp_t = spatz_axi_wrapper_out_resp_t, + + // External narrow output types use the wrapper-narrowed ID (WrapperAxiNarrowIdOutWidth). 
+ parameter type axi_narrow_out_req_t = spatz_axi_wrapper_narrow_out_req_t, + parameter type axi_narrow_out_resp_t = spatz_axi_wrapper_narrow_out_resp_t )( - input logic clk_i, - input logic rst_ni, - output logic [3:0] eoc_o, - input logic debug_req_i, - - input logic meip_i, - input logic mtip_i, - input logic msip_i, - output logic cluster_probe_o, - input axi_in_req_t axi_in_req_i, - output axi_in_resp_t axi_in_resp_o, - /// AXI Narrow out-port (UART) - output axi_uart_req_t axi_narrow_req_o, - input axi_uart_resp_t axi_narrow_resp_i, - output axi_out_req_t [NumClusterSlv-1:0] axi_out_req_o, - input axi_out_resp_t [NumClusterSlv-1:0] axi_out_resp_i + input logic clk_i, + input logic rst_ni, + output logic [3:0] eoc_o, + input logic debug_req_i, + + input logic meip_i, + input logic mtip_i, + input logic msip_i, + output logic cluster_probe_o, + // AXI slave port (from SoC/testbench); external ID = AxiInIdWidth. + input axi_in_req_t axi_in_req_i, + output axi_in_resp_t axi_in_resp_o, + /// AXI Narrow out-port (UART); external ID = AxiNarrowOutIdWidth. + output axi_narrow_out_req_t axi_narrow_req_o, + input axi_narrow_out_resp_t axi_narrow_resp_i, + // AXI wide master ports (to DRAM); external ID = AxiOutIdWidth. + output axi_out_req_t [NumClusterSlv-1:0] axi_out_req_o, + input axi_out_resp_t [NumClusterSlv-1:0] axi_out_resp_i ); - - spatz_axi_iwc_out_req_t [NumClusterSlv-1:0] axi_from_cluster_iwc_req; - spatz_axi_iwc_out_resp_t [NumClusterSlv-1:0] axi_from_cluster_iwc_resp; + // Internal signals between wrapper remappers and cluster (fat IDs). + spatz_axi_in_req_t axi_cluster_in_req; + spatz_axi_in_resp_t axi_cluster_in_resp; + axi_uart_req_t axi_cluster_narrow_req; + axi_uart_resp_t axi_cluster_narrow_resp; + spatz_axi_out_req_t [NumClusterSlv-1:0] axi_cluster_out_req; + spatz_axi_out_resp_t [NumClusterSlv-1:0] axi_cluster_out_resp; // Spatz cluster under test. + // Internal AXI types are fixed (full-width IDs); the wrapper remaps at both boundaries. 
cachepool_cluster #( .AxiAddrWidth (AxiAddrWidth ), .AxiDataWidth (AxiDataWidth ), - .AxiIdWidthIn (AxiInIdWidth ), - .AxiIdWidthOut (AxiOutIdWidth ), + // Cluster always sees the full internal ID width on its slave port. + .AxiIdWidthIn (SpatzAxiIdInWidth ), + .AxiIdWidthOut (SpatzAxiIdOutWidth ), .AxiUserWidth (AxiUserWidth ), .BootAddr (BootAddr ), .UartAddr (UartAddr ), @@ -74,12 +89,15 @@ module cachepool_cluster_wrapper .NumIntOutstandingLoads (NumIntOutstandingLoads ), .NumIntOutstandingMem (NumIntOutstandingMem ), .NumSpatzOutstandingLoads (NumSpatzOutstandingLoads ), - .axi_in_req_t (axi_in_req_t ), - .axi_in_resp_t (axi_in_resp_t ), - .axi_narrow_req_t (axi_narrow_req_t ), - .axi_narrow_resp_t (axi_narrow_resp_t ), - .axi_out_req_t (axi_out_req_t ), - .axi_out_resp_t (axi_out_resp_t ), + // Cluster slave port uses full internal type (remap is above this level). + .axi_in_req_t (spatz_axi_in_req_t ), + .axi_in_resp_t (spatz_axi_in_resp_t ), + // Cluster per-tile narrow type (internal crossbar width, not the UART mux output). + .axi_narrow_req_t (spatz_axi_narrow_req_t ), + .axi_narrow_resp_t (spatz_axi_narrow_resp_t ), + // Cluster internally uses the fat output type; the wrapper remaps it. 
+ .axi_out_req_t (spatz_axi_out_req_t ), + .axi_out_resp_t (spatz_axi_out_resp_t ), .Xdma (4'h0 ), .DMAAxiReqFifoDepth (3 ), .DMAReqFifoDepth (3 ), @@ -97,22 +115,86 @@ module cachepool_cluster_wrapper .eoc_o (eoc_o ), .impl_i ('0 ), .error_o ( ), - .debug_req_i ({NumCores{debug_req_i}} ), - .meip_i ({NumCores{meip_i}} ), - .mtip_i ({NumCores{mtip_i}} ), - .msip_i ({NumCores{msip_i}} ), + .debug_req_i (debug_req_i ), + .meip_i (meip_i ), + .mtip_i (mtip_i ), + .msip_i (msip_i ), .hart_base_id_i (10'h0 ), .cluster_base_addr_i (TCDMStartAddr ), .cluster_probe_o (cluster_probe_o ), - .axi_in_req_i , - .axi_in_resp_o , - .axi_narrow_req_o , - .axi_narrow_resp_i , - // AXI Master Port - .axi_out_req_o ( axi_out_req_o ), - .axi_out_resp_i ( axi_out_resp_i ) + // Remapped internal connections. + .axi_in_req_i (axi_cluster_in_req ), + .axi_in_resp_o (axi_cluster_in_resp ), + .axi_narrow_req_o (axi_cluster_narrow_req ), + .axi_narrow_resp_i (axi_cluster_narrow_resp ), + // AXI Master Port (fat IDs; wrapper remaps before external port). + .axi_out_req_o (axi_cluster_out_req ), + .axi_out_resp_i (axi_cluster_out_resp ) + ); + + // Expand WrapperAxiIdInWidth -> SpatzAxiIdInWidth on the cluster slave port. + // The external SoC/testbench drives narrow IDs; the cluster expects full-width IDs. + axi_id_remap #( + .AxiSlvPortIdWidth ( WrapperAxiIdInWidth ), + // Up to 2^WrapperAxiIdInWidth = 16 unique IDs from external host. + .AxiSlvPortMaxUniqIds ( 2**WrapperAxiIdInWidth ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( SpatzAxiIdInWidth ), + .slv_req_t ( axi_in_req_t ), + .slv_resp_t ( axi_in_resp_t ), + .mst_req_t ( spatz_axi_in_req_t ), + .mst_resp_t ( spatz_axi_in_resp_t ) + ) i_in_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_in_req_i ), + .slv_resp_o ( axi_in_resp_o ), + .mst_req_o ( axi_cluster_in_req ), + .mst_resp_i ( axi_cluster_in_resp ) ); + // Compress SpatzAxiUartIdWidth -> WrapperAxiNarrowIdOutWidth on the UART master port. 
+ axi_id_remap #( + .AxiSlvPortIdWidth ( SpatzAxiUartIdWidth ), + // Cap at 2^WrapperAxiNarrowIdOutWidth unique IDs toward the SoC. + .AxiSlvPortMaxUniqIds ( 2**WrapperAxiNarrowIdOutWidth ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( WrapperAxiNarrowIdOutWidth ), + .slv_req_t ( axi_uart_req_t ), + .slv_resp_t ( axi_uart_resp_t ), + .mst_req_t ( axi_narrow_out_req_t ), + .mst_resp_t ( axi_narrow_out_resp_t ) + ) i_narrow_out_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_cluster_narrow_req ), + .slv_resp_o ( axi_cluster_narrow_resp ), + .mst_req_o ( axi_narrow_req_o ), + .mst_resp_i ( axi_narrow_resp_i ) + ); + + // Reduce SpatzAxiIdOutWidth -> WrapperAxiIdOutWidth per DRAM channel. + // NumAxiMaxTrans = 32 outstanding per channel; 6 bits gives 64 unique ID slots. + for (genvar ch = 0; ch < NumClusterSlv; ch++) begin : gen_out_id_remap + axi_id_remap #( + .AxiSlvPortIdWidth ( SpatzAxiIdOutWidth ), + .AxiSlvPortMaxUniqIds ( NumAxiMaxTrans ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( WrapperAxiIdOutWidth ), + .slv_req_t ( spatz_axi_out_req_t ), + .slv_resp_t ( spatz_axi_out_resp_t ), + .mst_req_t ( spatz_axi_wrapper_out_req_t ), + .mst_resp_t ( spatz_axi_wrapper_out_resp_t ) + ) i_out_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_cluster_out_req [ch] ), + .slv_resp_o ( axi_cluster_out_resp [ch] ), + .mst_req_o ( axi_out_req_o [ch] ), + .mst_resp_i ( axi_out_resp_i [ch] ) + ); + end + // AXI utilization monitor `ifndef TARGET_SYNTHESIS typedef logic [31:0] cnt_t; @@ -241,11 +323,14 @@ module cachepool_cluster_wrapper if (AxiUserWidth != SpatzAxiUserWidth) $error("[spatz_cluster_wrapper] AXI User Width does not match the configuration."); - if (AxiInIdWidth != SpatzAxiIdInWidth) + if (AxiInIdWidth != WrapperAxiIdInWidth) $error("[spatz_cluster_wrapper] AXI Id Width (In) does not match the configuration."); - if (AxiOutIdWidth != SpatzAxiIdOutWidth) + if (AxiOutIdWidth != 
WrapperAxiIdOutWidth) $error("[spatz_cluster_wrapper] AXI Id Width (Out) does not match the configuration."); + + if (AxiNarrowOutIdWidth != WrapperAxiNarrowIdOutWidth) + $error("[spatz_cluster_wrapper] AXI Narrow Id Width (Out) does not match the configuration."); `endif endmodule diff --git a/hardware/tb/tb_cachepool.sv b/hardware/tb/tb_cachepool.sv index c1276bf..385d699 100644 --- a/hardware/tb/tb_cachepool.sv +++ b/hardware/tb/tb_cachepool.sv @@ -66,18 +66,19 @@ module tb_cachepool; localparam NumAXISlaves = 2; localparam NumRules = NumAXISlaves-1; - // Spatz wide port to SoC (currently dram) - spatz_axi_out_req_t [NumL2Channel-1:0] axi_from_cluster_req; - spatz_axi_out_resp_t [NumL2Channel-1:0] axi_from_cluster_resp; - // From SoC to Spatz - spatz_axi_in_req_t axi_to_cluster_req; - spatz_axi_in_resp_t axi_to_cluster_resp; + // Spatz wide port to SoC (currently dram); IDs narrowed by wrapper-level axi_id_remap. + spatz_axi_wrapper_out_req_t [NumL2Channel-1:0] axi_from_cluster_req; + spatz_axi_wrapper_out_resp_t [NumL2Channel-1:0] axi_from_cluster_resp; + // From SoC to Spatz; IDs expanded by wrapper-level axi_id_remap (WrapperAxiIdInWidth → SpatzAxiIdInWidth). + spatz_axi_wrapper_in_req_t axi_to_cluster_req; + spatz_axi_wrapper_in_resp_t axi_to_cluster_resp; - axi_uart_req_t axi_uart_req; - axi_uart_resp_t axi_uart_rsp; + // UART; IDs compressed by wrapper-level axi_id_remap (SpatzAxiUartIdWidth → WrapperAxiNarrowIdOutWidth). 
+ spatz_axi_wrapper_narrow_out_req_t axi_uart_req; + spatz_axi_wrapper_narrow_out_resp_t axi_uart_rsp; // DRAM Scrambled request - spatz_axi_out_req_t [NumL2Channel-1:0] axi_dram_req; + spatz_axi_wrapper_out_req_t [NumL2Channel-1:0] axi_dram_req; /********* @@ -141,13 +142,13 @@ module tb_cachepool; reqrsp_cluster_in_rsp_t to_cluster_rsp; reqrsp_to_axi #( - .DataWidth (SpatzDataWidth ), - .AxiUserWidth(SpatzAxiUserWidth ), - .UserWidth ($bits(tcdm_user_t) ), - .axi_req_t (spatz_axi_in_req_t ), - .axi_rsp_t (spatz_axi_in_resp_t ), - .reqrsp_req_t(reqrsp_cluster_in_req_t), - .reqrsp_rsp_t(reqrsp_cluster_in_rsp_t) + .DataWidth (SpatzDataWidth ), + .AxiUserWidth(SpatzAxiUserWidth ), + .UserWidth ($bits(tcdm_user_t) ), + .axi_req_t (spatz_axi_wrapper_in_req_t ), + .axi_rsp_t (spatz_axi_wrapper_in_resp_t ), + .reqrsp_req_t(reqrsp_cluster_in_req_t ), + .reqrsp_rsp_t(reqrsp_cluster_in_rsp_t ) ) i_reqrsp_to_axi ( .clk_i (clk ), .rst_ni (rst_n ), @@ -293,8 +294,8 @@ module tb_cachepool; **********/ axi_uart #( - .axi_req_t (axi_uart_req_t ), - .axi_resp_t(axi_uart_resp_t) + .axi_req_t (spatz_axi_wrapper_narrow_out_req_t ), + .axi_resp_t(spatz_axi_wrapper_narrow_out_resp_t) ) i_axi_uart ( .clk_i (clk ), .rst_ni (rst_n ), @@ -380,19 +381,19 @@ module tb_cachepool; for (genvar mem = 0; mem < NumL2Channel; mem++) begin: gen_dram axi_dram_sim #( - .BASE ( DramBase ), - .DRAMType ( DramType ), - .AxiAddrWidth ( SpatzAxiAddrWidth ), - .AxiDataWidth ( SpatzAxiDataWidth ), - .AxiIdWidth ( SpatzAxiIdOutWidth ), - .AxiUserWidth ( SpatzAxiUserWidth ), - .axi_req_t ( spatz_axi_out_req_t ), - .axi_resp_t ( spatz_axi_out_resp_t ), - .axi_ar_t ( spatz_axi_out_ar_chan_t ), - .axi_r_t ( spatz_axi_out_r_chan_t ), - .axi_aw_t ( spatz_axi_out_aw_chan_t ), - .axi_w_t ( spatz_axi_out_w_chan_t ), - .axi_b_t ( spatz_axi_out_b_chan_t ) + .BASE ( DramBase ), + .DRAMType ( DramType ), + .AxiAddrWidth ( SpatzAxiAddrWidth ), + .AxiDataWidth ( SpatzAxiDataWidth ), + .AxiIdWidth ( WrapperAxiIdOutWidth ), + 
.AxiUserWidth ( SpatzAxiUserWidth ), + .axi_req_t ( spatz_axi_wrapper_out_req_t ), + .axi_resp_t ( spatz_axi_wrapper_out_resp_t ), + .axi_ar_t ( spatz_axi_wrapper_out_ar_chan_t ), + .axi_r_t ( spatz_axi_wrapper_out_r_chan_t ), + .axi_aw_t ( spatz_axi_wrapper_out_aw_chan_t ), + .axi_w_t ( spatz_axi_wrapper_out_w_chan_t ), + .axi_b_t ( spatz_axi_wrapper_out_b_chan_t ) ) i_axi_dram_sim ( .clk_i ( clk ), .rst_ni ( rst_n ), From 658807ce15caf37bf3faa9417c2981e7ac6586ee Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 19 May 2026 15:43:16 +0200 Subject: [PATCH 21/37] [SRC] Fix a lint issue and add needed cut. --- hardware/src/cachepool_cluster.sv | 12 +++++-- hardware/src/cachepool_tile.sv | 60 +++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index 7379126..60996e0 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -657,7 +657,11 @@ module cachepool_cluster .slv_req_t ( axi_narrow_req_t ), .slv_resp_t ( axi_narrow_resp_t ), .mst_req_t ( axi_csr_ser_req_t ), - .mst_resp_t ( axi_csr_ser_resp_t ) + .mst_resp_t ( axi_csr_ser_resp_t ), + // Provide one dummy entry to avoid [IdMapNumEntries-1:0] underflow when 0. + // Entry maps ID 0 -> 0, which is identical to the default modulo formula. + .IdMapNumEntries ( 1 ), + .IdMap ( '{'{32'd0, 32'd0}} ) ) i_csr_id_serialize ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), @@ -682,7 +686,11 @@ module cachepool_cluster .slv_req_t ( axi_in_req_t ), .slv_resp_t ( axi_in_resp_t ), .mst_req_t ( axi_csr_ser_req_t ), - .mst_resp_t ( axi_csr_ser_resp_t ) + .mst_resp_t ( axi_csr_ser_resp_t ), + // Provide one dummy entry to avoid [IdMapNumEntries-1:0] underflow when 0. + // Entry maps ID 0 -> 0, which is identical to the default modulo formula. 
+ .IdMapNumEntries ( 1 ), + .IdMap ( '{'{32'd0, 32'd0}} ) ) i_csr_in_id_serialize ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 144bfa2..eb86651 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -910,21 +910,51 @@ module cachepool_tile assign cache_rsp_reg.p.write = cache_rsp_write[cb][j]; end else begin : gen_no_amo - // Bypass AMO and registers - assign cache_req_valid[cb][j] = cache_xbar_req [j][cb].q_valid; - assign cache_rsp_ready[cb][j] = cache_xbar_pready[j][cb]; - assign cache_req_addr [cb][j] = cache_xbar_req [j][cb].q.addr; - assign cache_req_meta [cb][j] = cache_xbar_req [j][cb].q.user; - assign cache_req_write[cb][j] = cache_xbar_req [j][cb].q.write; - assign cache_req_data [cb][j] = cache_xbar_req [j][cb].q.data; - assign cache_req_strb [cb][j] = cache_xbar_req [j][cb].q.strb; - - assign cache_xbar_rsp[j][cb].p_valid = cache_rsp_valid[cb][j]; - assign cache_xbar_rsp[j][cb].q_ready = cache_req_ready[cb][j]; - assign cache_xbar_rsp[j][cb].p.data = cache_rsp_data [cb][j]; - assign cache_xbar_rsp[j][cb].p.user = cache_rsp_meta [cb][j]; - - assign cache_xbar_rsp[j][cb].p.write = cache_rsp_write[cb][j]; + // Spill registers to cut the L1 xbar → coalescer critical path, + // matching the timing budget of the Snitch AMO path above. 
+ tcdm_req_t cache_req_reg; + tcdm_rsp_t cache_rsp_reg; + + spill_register #( + .T ( tcdm_req_chan_t ), + .Bypass ( 1'b0 ) + ) i_spill_reg_cache_req ( + .clk_i , + .rst_ni ( rst_ni ), + .valid_i ( cache_xbar_req[j][cb].q_valid ), + .ready_o ( cache_xbar_rsp[j][cb].q_ready ), + .data_i ( cache_xbar_req[j][cb].q ), + .valid_o ( cache_req_reg.q_valid ), + .ready_i ( cache_rsp_reg.q_ready ), + .data_o ( cache_req_reg.q ) + ); + + spill_register #( + .T ( tcdm_rsp_chan_t ), + .Bypass ( 1'b1 ) + ) i_spill_reg_cache_rsp ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .valid_i ( cache_rsp_reg.p_valid ), + .ready_o ( cache_rsp_ready[cb][j] ), + .data_i ( cache_rsp_reg.p ), + .valid_o ( cache_xbar_rsp[j][cb].p_valid ), + .ready_i ( cache_xbar_pready[j][cb] ), + .data_o ( cache_xbar_rsp[j][cb].p ) + ); + + assign cache_req_valid[cb][j] = cache_req_reg.q_valid; + assign cache_req_addr [cb][j] = cache_req_reg.q.addr; + assign cache_req_meta [cb][j] = cache_req_reg.q.user; + assign cache_req_write[cb][j] = cache_req_reg.q.write; + assign cache_req_data [cb][j] = cache_req_reg.q.data; + assign cache_req_strb [cb][j] = cache_req_reg.q.strb; + + assign cache_rsp_reg.p_valid = cache_rsp_valid[cb][j]; + assign cache_rsp_reg.q_ready = cache_req_ready[cb][j]; + assign cache_rsp_reg.p.data = cache_rsp_data [cb][j]; + assign cache_rsp_reg.p.user = cache_rsp_meta [cb][j]; + assign cache_rsp_reg.p.write = cache_rsp_write[cb][j]; end end From e3c244eefdb21279762defcc286056e81a788a61 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 22 May 2026 09:45:55 +0200 Subject: [PATCH 22/37] [SRC] Clean code. 
--- config/cachepool_2g.mk | 8 +- config/cachepool_fpu_16g.mk | 8 +- config/cachepool_fpu_2g.mk | 8 +- config/cachepool_fpu_4g.mk | 10 +- config/config.mk | 17 +- hardware/src/cachepool_cc.sv | 9 +- hardware/src/cachepool_cluster.sv | 206 +++++++++----------- hardware/src/cachepool_group.sv | 12 -- hardware/src/cachepool_group_noc_wrapper.sv | 28 +-- hardware/src/cachepool_pkg.sv | 11 +- hardware/src/cachepool_tile.sv | 16 -- hardware/tb/cachepool_cluster_wrapper.sv | 3 - sim/scripts/vsim_cluster.tcl | 2 +- sim/scripts/vsim_core.tcl | 13 +- sim/scripts/vsim_group.tcl | 15 +- sim/scripts/vsim_tile.tcl | 42 ++-- sim/scripts/vsim_wave.tcl | 63 +++--- sim/scripts/vsim_wave_single_tile.tcl | 13 +- software/tests/CMakeLists.txt | 2 + 19 files changed, 229 insertions(+), 257 deletions(-) diff --git a/config/cachepool_2g.mk b/config/cachepool_2g.mk index b58155c..271eedb 100644 --- a/config/cachepool_2g.mk +++ b/config/cachepool_2g.mk @@ -11,6 +11,9 @@ # Number of groups num_groups ?= 2 +# 1×2 mesh +num_groups_x ?= 1 + # Number of tiles num_tiles_per_group ?= 4 @@ -27,6 +30,8 @@ num_remote_ports_per_tile ?= 1 num_rg_ports_per_core ?= 1 +num_noc_ports_per_tile ?= 1 + ###################### ## CachePool Tile ## @@ -40,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 diff --git a/config/cachepool_fpu_16g.mk b/config/cachepool_fpu_16g.mk index 125a74d..8cf8445 100644 --- a/config/cachepool_fpu_16g.mk +++ b/config/cachepool_fpu_16g.mk @@ -11,6 +11,9 @@ # Number of groups num_groups ?= 16 +# 4×4 mesh +num_groups_x ?= 4 + # Number of tiles num_tiles_per_group ?= 4 @@ -27,6 +30,8 @@ num_remote_ports_per_tile ?= 1 num_rg_ports_per_core ?= 1 +num_noc_ports_per_tile ?= 2 + ###################### ## CachePool Tile ## @@ -40,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) l1d_bank_factor ?= 1 diff --git a/config/cachepool_fpu_2g.mk b/config/cachepool_fpu_2g.mk index df88162..efb23fa 100644 --- a/config/cachepool_fpu_2g.mk +++ b/config/cachepool_fpu_2g.mk @@ -11,6 +11,9 @@ # Number of groups num_groups ?= 2 +# 1×2 mesh +num_groups_x ?= 1 + # Number of tiles num_tiles_per_group ?= 4 @@ -27,6 +30,8 @@ num_remote_ports_per_tile ?= 1 num_rg_ports_per_core ?= 1 +num_noc_ports_per_tile ?= 1 + ###################### ## CachePool Tile ## @@ -40,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 diff --git a/config/cachepool_fpu_4g.mk b/config/cachepool_fpu_4g.mk index 3d60410..90a6af8 100644 --- a/config/cachepool_fpu_4g.mk +++ b/config/cachepool_fpu_4g.mk @@ -11,6 +11,9 @@ # Number of groups num_groups ?= 4 +# 2×2 mesh +num_groups_x ?= 2 + # Number of tiles num_tiles_per_group ?= 4 @@ -27,6 +30,8 @@ num_remote_ports_per_tile ?= 1 num_rg_ports_per_core ?= 1 +num_noc_ports_per_tile ?= 2 + ###################### ## CachePool Tile ## @@ -40,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) l1d_bank_factor ?= 1 @@ -81,7 +83,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 16 +l2_channel ?= 8 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/config/config.mk b/config/config.mk index 32ed0f3..1164e89 100644 --- a/config/config.mk +++ b/config/config.mk @@ -29,6 +29,9 @@ include $(CACHEPOOL_DIR)/config/$(config).mk # Number of groups num_groups ?= 1 +# X dimension of the group mesh (Y = num_groups / num_groups_x) +num_groups_x ?= 1 + # Number of tiles num_tiles_per_group ?= 4 num_tiles = $(shell echo $$(( $(num_groups) * $(num_tiles_per_group)))) @@ -41,6 +44,8 @@ num_cores ?= $(shell echo $$(( $(num_tiles) * $(num_cores_per_tile)))) num_rg_ports_per_core ?= 0 +num_noc_ports_per_tile ?= 1 + # Core datawidth data_width ?= 32 @@ -60,9 +65,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -120,6 +122,15 @@ endif ##################### ## L2 Main Memory ## ##################### + +# DRAM base address and size (hex: 0x8000_0000, 0x2000_0000) +dram_addr ?= 2147483648 +dram_len ?= 536870912 + +# Uncached region base address and size (hex: 0xC000_0000, 0x2000_0000) +uncached_addr ?= 3221225472 +uncached_len ?= 536870912 + # L2 number of channels l2_channel ?= 4 diff --git a/hardware/src/cachepool_cc.sv b/hardware/src/cachepool_cc.sv index 86c8d7e..1950cd6 100644 --- a/hardware/src/cachepool_cc.sv +++ b/hardware/src/cachepool_cc.sv @@ -23,12 +23,6 @@ module cachepool_cc parameter int unsigned DataWidth = 0, /// User width of the buses. parameter int unsigned UserWidth = 0, - /// Data width of the AXI DMA buses. - parameter int unsigned DMADataWidth = 0, - /// Id width of the AXI DMA bus. - parameter int unsigned DMAIdWidth = 0, - parameter int unsigned DMAAxiReqFifoDepth = 0, - parameter int unsigned DMAReqFifoDepth = 0, parameter int unsigned SpmStackDepth = 512, /// Data port request type. @@ -75,7 +69,6 @@ module cachepool_cc parameter bit XF16ALT = 0, parameter bit XF8ALT = 0, /// Enable Snitch DMA - parameter bit Xdma = 0, parameter int unsigned NumIntOutstandingLoads = 0, parameter int unsigned NumIntOutstandingMem = 0, parameter int unsigned NumSpatzOutstandingLoads = 0, @@ -176,7 +169,7 @@ module cachepool_cc .VMSupport (1'b0 ), .RVE (RVE ), .FP_EN (FPEn ), - .Xdma (Xdma ), + .Xdma (1'b0 ), .RVF (RVF ), .RVD (RVD ), .RVV (RVV ), diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index 60996e0..c9102ef 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -37,10 +37,6 @@ module cachepool_cluster parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. 
- parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -55,8 +51,6 @@ module cachepool_cluster /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = 0, @@ -237,90 +231,91 @@ module cachepool_cluster assign error_o = |group_error; - for (genvar g = 0; g < NumGroups; g++) begin : gen_group - cachepool_group_noc_wrapper #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NumCoreGroup ), - .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks / NumGroups ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma[g*NumCoreGroup +: NumCoreGroup] ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - 
.RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_group ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( group_error[g] ), - .debug_req_i ( debug_req_i ), - .meip_i ( meip_i ), - .mtip_i ( mtip_i ), - .msip_i ( msip_i ), - .hart_base_id_i ( hart_base_id_i + 10'(g * NumCoreGroup) ), - .tile_base_id_i ( TileIDWidth'(g * NumTilesPerGroup) ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .private_start_addr_i ( private_start_addr ), - .axi_narrow_req_o ( axi_out_req [g*NumTilesPerGroup +: NumTilesPerGroup] ), - .axi_narrow_rsp_i ( axi_out_resp[g*NumTilesPerGroup +: NumTilesPerGroup] ), - // DRAM refill reqrsp (post-xbar, one per L2 channel) - .l2_req_o ( l2_req[g] ), - .l2_rsp_i ( l2_rsp[g] ), - // Peripherals - .icache_events_o ( /* unused */ ), - .icache_prefetch_enable_i ( icache_prefetch_enable ), - .cl_interrupt_i ( cl_interrupt [g*NumCoreGroup +: NumCoreGroup] ), - .dynamic_offset_i ( dynamic_offset ), - .l1d_private_i ( l1d_private ), - .l1d_insn_i ( l1d_insn ), - .l1d_insn_valid_i ( l1d_insn_valid ), - .l1d_insn_ready_o ( l1d_insn_ready[g*NumTilesPerGroup +: NumTilesPerGroup]), - .l1d_busy_i ( l1d_busy [g*NumTilesPerGroup +: NumTilesPerGroup]), - .group_xy_id_i ( group_xy_id_t'{x: g % NumGroupsX, - y: g / NumGroupsX, - port_id: 1'b0} ), - .noc_req_o ( noc_req_out [g] ), - .noc_req_valid_o ( noc_req_out_valid[g] ), - .noc_req_ready_i ( noc_req_out_ready[g] ), - .noc_req_i ( noc_req_in [g] ), - .noc_req_valid_i ( noc_req_in_valid [g] ), - .noc_req_ready_o ( noc_req_in_ready [g] ), - .noc_rsp_o ( noc_rsp_out [g] ), - .noc_rsp_valid_o ( noc_rsp_out_valid[g] ), - .noc_rsp_ready_i ( noc_rsp_out_ready[g] ), - .noc_rsp_i ( noc_rsp_in [g] ), - .noc_rsp_valid_i ( noc_rsp_in_valid [g] ), - .noc_rsp_ready_o ( noc_rsp_in_ready [g] ) - ); + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_group_y + for (genvar gx = 
0; gx < NumGroupsX; gx++) begin : gen_group_x + // Flat group index: g = gy * NumGroupsX + gx + localparam int unsigned g = gy * NumGroupsX + gx; + cachepool_group_noc_wrapper #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoreGroup ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks / NumGroups ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_group ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( group_error[g] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id_i + 10'(g * NumCoreGroup) ), + .tile_base_id_i ( TileIDWidth'(g * NumTilesPerGroup) ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .private_start_addr_i ( private_start_addr ), + .axi_narrow_req_o ( axi_out_req [g*NumTilesPerGroup +: NumTilesPerGroup] ), + 
.axi_narrow_rsp_i ( axi_out_resp[g*NumTilesPerGroup +: NumTilesPerGroup] ), + // DRAM refill reqrsp (post-xbar, one per L2 channel) + .l2_req_o ( l2_req[g] ), + .l2_rsp_i ( l2_rsp[g] ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable ), + .cl_interrupt_i ( cl_interrupt [g*NumCoreGroup +: NumCoreGroup] ), + .dynamic_offset_i ( dynamic_offset ), + .l1d_private_i ( l1d_private ), + .l1d_insn_i ( l1d_insn ), + .l1d_insn_valid_i ( l1d_insn_valid ), + .l1d_insn_ready_o ( l1d_insn_ready[g*NumTilesPerGroup +: NumTilesPerGroup]), + .l1d_busy_i ( l1d_busy [g*NumTilesPerGroup +: NumTilesPerGroup]), + .group_xy_id_i ( group_xy_id_t'{x: gx, + y: gy, + port_id: 1'b0} ), + .noc_req_o ( noc_req_out [g] ), + .noc_req_valid_o ( noc_req_out_valid[g] ), + .noc_req_ready_i ( noc_req_out_ready[g] ), + .noc_req_i ( noc_req_in [g] ), + .noc_req_valid_i ( noc_req_in_valid [g] ), + .noc_req_ready_o ( noc_req_in_ready [g] ), + .noc_rsp_o ( noc_rsp_out [g] ), + .noc_rsp_valid_o ( noc_rsp_out_valid[g] ), + .noc_rsp_ready_i ( noc_rsp_out_ready[g] ), + .noc_rsp_i ( noc_rsp_in [g] ), + .noc_rsp_valid_i ( noc_rsp_in_valid [g] ), + .noc_rsp_ready_o ( noc_rsp_in_ready [g] ) + ); + end end // ---------------------------- @@ -412,8 +407,10 @@ module cachepool_cluster // ------------- // Step 1: Per-group reqrsp_to_axi conversion. 
- for (genvar g = 0; g < NumGroups; g++) begin : gen_per_group_l2 - for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_per_ch + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_per_group_l2 + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_per_group_l2 + localparam int unsigned g = gy * NumGroupsX + gx; + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_per_ch reqrsp_to_axi #( .MaxTrans ( NumSpatzOutstandingLoads*2 ), .ID ( '0 ), @@ -436,6 +433,7 @@ module cachepool_cluster .axi_req_o ( wide_axi_premux_req[g][ch] ), .axi_rsp_i ( wide_axi_premux_rsp[g][ch] ) ); + end end end @@ -755,30 +753,6 @@ module cachepool_cluster .reg_rsp_i (reg_rsp ) ); - - // Event counter increments for the TCDM. - typedef struct packed { - /// Number requests going in - logic [$clog2(5):0] inc_accessed; - /// Number of requests stalled due to congestion - logic [$clog2(5):0] inc_congested; - } tcdm_events_t; - - // Event counter increments for DMA. - typedef struct packed { - logic aw_stall, ar_stall, r_stall, w_stall, - buf_w_stall, buf_r_stall; - logic aw_valid, aw_ready, aw_done, aw_bw; - logic ar_valid, ar_ready, ar_done, ar_bw; - logic r_valid, r_ready, r_done, r_bw; - logic w_valid, w_ready, w_done, w_bw; - logic b_valid, b_ready, b_done; - logic dma_busy; - axi_pkg::len_t aw_len, ar_len; - axi_pkg::size_t aw_size, ar_size; - logic [$clog2(SpatzAxiNarrowDataWidth/8):0] num_bytes_written; - } dma_events_t; - cachepool_peripheral #( .AddrWidth (AxiAddrWidth ), .SPMWidth ($clog2(L1NumSet)), diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index 1637bff..0a0798d 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -38,10 +38,6 @@ module cachepool_group parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. 
- parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -55,8 +51,6 @@ module cachepool_group /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = 0, @@ -689,12 +683,9 @@ module cachepool_group .axi_narrow_resp_t ( axi_narrow_resp_t ), .axi_out_req_t ( axi_mst_cache_req_t ), .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), .TileIDWidth ( TileIDWidth ), .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), .NumTilesPerGroup ( NumTilesPerGroup ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), .RegisterOffloadRsp ( RegisterOffloadRsp ), .RegisterCoreReq ( RegisterCoreReq ), .RegisterCoreRsp ( RegisterCoreRsp ), @@ -778,12 +769,9 @@ module cachepool_group .axi_narrow_resp_t ( axi_narrow_resp_t ), .axi_out_req_t ( axi_mst_cache_req_t ), .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), .TileIDWidth ( TileIDWidth ), .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), .NumTilesPerGroup ( NumTilesPerGroup ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), .RegisterOffloadRsp ( RegisterOffloadRsp ), .RegisterCoreReq ( RegisterCoreReq ), .RegisterCoreRsp ( RegisterCoreRsp ), diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv index ce8d6c3..710038e 100644 --- a/hardware/src/cachepool_group_noc_wrapper.sv +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -28,12 +28,9 @@ module cachepool_group_noc_wrapper parameter int unsigned TCDMDepth = 1024, parameter int unsigned ClusterPeriphSize = 64, parameter int unsigned NrBanks = 2 * NrCores, - 
parameter int unsigned DMAAxiReqFifoDepth = 3, - parameter int unsigned DMAReqFifoDepth = 3, parameter int unsigned ICacheLineWidth = 0, parameter int unsigned ICacheLineCount = 0, parameter int unsigned ICacheSets = 0, - parameter bit [NrCores-1:0] Xdma = '{default: '0}, parameter fpu_implementation_t FPUImplementation = fpu_implementation_t'(0), parameter int unsigned NumSpatzFPUs = 1, parameter int unsigned NumSpatzIPUs = 1, @@ -154,15 +151,16 @@ module cachepool_group_noc_wrapper for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mesh_trans_t for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_mesh_trans_n for (genvar d = 0; d < 4; d++) begin : gen_mesh_trans_d - assign noc_req_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out[t][n][d]; - assign noc_req_valid_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d]; + // Mute the channel when not valid for debugging + assign noc_req_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d] ? req_mesh_out[t][n][d] : '0; + assign noc_req_valid_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d]; assign req_mesh_out_ready[t][n][d] = noc_req_ready_i[d][t*NumNoCPortsPerTile+n]; assign req_mesh_in[t][n][d] = noc_req_i[d][t*NumNoCPortsPerTile+n]; assign req_mesh_in_valid[t][n][d] = noc_req_valid_i[d][t*NumNoCPortsPerTile+n]; assign noc_req_ready_o[d][t*NumNoCPortsPerTile+n] = req_mesh_in_ready[t][n][d]; - assign noc_rsp_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out[t][n][d]; - assign noc_rsp_valid_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d]; + assign noc_rsp_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d] ? 
rsp_mesh_out[t][n][d] : '0; + assign noc_rsp_valid_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d]; assign rsp_mesh_out_ready[t][n][d] = noc_rsp_ready_i[d][t*NumNoCPortsPerTile+n]; assign rsp_mesh_in[t][n][d] = noc_rsp_i[d][t*NumNoCPortsPerTile+n]; assign rsp_mesh_in_valid[t][n][d] = noc_rsp_valid_i[d][t*NumNoCPortsPerTile+n]; @@ -244,6 +242,17 @@ module cachepool_group_noc_wrapper assign mst_xbar_mst_sel[n] = eject_rsp[noc_port].hdr.src_port_id; end + // Static port-to-NoC-channel mapping: each flat port p has xbar index + // j = p % NrTCDMPortsPerCore, and is steered to NoC channel j % NumNoCPortsPerTile. + // Spatz ports (j=0..NrTCDMPortsPerCore-2) divide evenly across channels; + // Snitch (j=NrTCDMPortsPerCore-1) maps by the same modulo. + localparam int unsigned NocMstSelWidth = (NumNoCPortsPerTile > 1) + ? $clog2(NumNoCPortsPerTile) : 1; + logic [NumRemoteGroupPortTile-1:0][NocMstSelWidth-1:0] noc_mst_sel; + for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_noc_mst_sel + assign noc_mst_sel[p] = NocMstSelWidth'((p % NrTCDMPortsPerCore) % NumNoCPortsPerTile); + end + reqrsp_xbar #( .NumInp ( NumRemoteGroupPortTile ), .NumOut ( NumNoCPortsPerTile ), @@ -259,7 +268,7 @@ module cachepool_group_noc_wrapper .slv_rsp_o ( mst_slv_rsp ), .slv_rsp_valid_o ( mst_slv_rsp_valid ), .slv_rsp_ready_i ( mst_slv_rsp_ready ), - .slv_sel_i ( '0 ), + .slv_sel_i ( noc_mst_sel ), .slv_selected_o ( mst_xbar_slv_selected ), .mst_req_o ( mst_xbar_req[t*NumNoCPortsPerTile +: NumNoCPortsPerTile] ), @@ -544,9 +553,6 @@ module cachepool_group_noc_wrapper .axi_narrow_resp_t ( axi_narrow_resp_t ), .axi_out_req_t ( axi_out_req_t ), .axi_out_resp_t ( axi_out_resp_t ), - .Xdma ( Xdma ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), .RegisterOffloadRsp ( RegisterOffloadRsp ), .RegisterCoreReq ( RegisterCoreReq ), .RegisterCoreRsp ( RegisterCoreRsp ), diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index 
6dd9af7..ab08758 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -285,8 +285,6 @@ package cachepool_pkg; // Wide AXI ports: X to DRAM (X=4 for now) localparam int unsigned ClusterWideOutAxiPorts = NumL2Channel; - // TODO: multi-tile support - // One more from the Snitch core ////////////////// // L2 / DRAM // @@ -476,6 +474,15 @@ package cachepool_pkg; noc_group_hdr_t hdr; } noc_group_rsp_t; + // Group ICache (L2 read-only cache control) + localparam int unsigned ROCacheNumAddrRules = 4; + typedef struct packed { + logic enable; + logic flush_valid; + logic [ROCacheNumAddrRules-1:0][AddrWidth-1:0] start_addr; + logic [ROCacheNumAddrRules-1:0][AddrWidth-1:0] end_addr; + } ro_cache_ctrl_t; + ///////////////////// // CLUSTER TYPES // diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index eb86651..24adf18 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -40,10 +40,6 @@ module cachepool_tile parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. - parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -58,8 +54,6 @@ module cachepool_tile /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// Tile ID Width parameter int unsigned TileIDWidth = 0, /// Number of dedicated inter-group remote ports per xbar plane. 
@@ -1385,15 +1379,10 @@ module cachepool_tile .RVF (RVF ), .RVD (RVD ), .RVV (RVV ), - .Xdma (Xdma[i] ), .AddrWidth (AxiAddrWidth ), .DataWidth (NarrowDataWidth ), .UserWidth (AxiUserWidth ), - .DMADataWidth (AxiDataWidth ), - .DMAIdWidth (WideIdWidthIn ), .SnitchPMACfg (SnitchPMACfg ), - .DMAAxiReqFifoDepth (DMAAxiReqFifoDepth ), - .DMAReqFifoDepth (DMAReqFifoDepth ), .dreq_t (reqrsp_req_t ), .drsp_t (reqrsp_rsp_t ), .dreq_chan_t (reqrsp_req_chan_t ), @@ -1701,12 +1690,7 @@ module cachepool_tile // ------------- // Sanity Checks // ------------- - // Sanity check the parameters. Not every configuration makes sense. - `ASSERT_INIT(CheckSuperBankSanity, NrBanks >= BanksPerSuperBank); - `ASSERT_INIT(CheckSuperBankFactor, (NrBanks % BanksPerSuperBank) == 0); // Check that the cluster base address aligns to the TCDMSize. `ASSERT(ClusterBaseAddrAlign, ((TCDMSize - 1) & cluster_base_addr_i) == 0) - // Make sure we only have one DMA in the system. - `ASSERT_INIT(NumberDMA, $onehot0(Xdma)) endmodule diff --git a/hardware/tb/cachepool_cluster_wrapper.sv b/hardware/tb/cachepool_cluster_wrapper.sv index cfac974..e412f7c 100644 --- a/hardware/tb/cachepool_cluster_wrapper.sv +++ b/hardware/tb/cachepool_cluster_wrapper.sv @@ -98,9 +98,6 @@ module cachepool_cluster_wrapper // Cluster internally uses the fat output type; the wrapper remaps it. 
.axi_out_req_t (spatz_axi_out_req_t ), .axi_out_resp_t (spatz_axi_out_resp_t ), - .Xdma (4'h0 ), - .DMAAxiReqFifoDepth (3 ), - .DMAReqFifoDepth (3 ), .RegisterOffloadRsp (1 ), .RegisterCoreReq (1 ), .RegisterCoreRsp (1 ), diff --git a/sim/scripts/vsim_cluster.tcl b/sim/scripts/vsim_cluster.tcl index e34f008..2213a16 100644 --- a/sim/scripts/vsim_cluster.tcl +++ b/sim/scripts/vsim_cluster.tcl @@ -5,7 +5,7 @@ # Create group for Cluster onerror {resume} -set cluster_path $1 +quietly set cluster_path $1 add wave -noupdate -group Cluster -group CSR ${cluster_path}/i_cachepool_cluster_peripheral/* diff --git a/sim/scripts/vsim_core.tcl b/sim/scripts/vsim_core.tcl index 4021c22..30ee61a 100644 --- a/sim/scripts/vsim_core.tcl +++ b/sim/scripts/vsim_core.tcl @@ -6,13 +6,16 @@ onerror {resume} quietly WaveActivateNextPane {} 0 -set core_path ${4} -set name g_${1}_t_${2}_c_${3} +quietly set core_path ${4} +quietly set name g_${1}_t_${2}_c_${3} -# Safely handle the optional 5th argument for nesting -set parent_grp [list] +# Build the parent group prefix list from optional args 5 (GroupWP) and 6 (tile) +quietly set parent_grp [list] if {$argc > 4 && "${5}" != ""} { - set parent_grp [list -group ${5}] + quietly lappend parent_grp -group ${5} +} +if {$argc > 5 && "${6}" != ""} { + quietly lappend parent_grp -group ${6} } # The {*} syntax safely expands the list. diff --git a/sim/scripts/vsim_group.tcl b/sim/scripts/vsim_group.tcl index f247f7e..876c4b2 100644 --- a/sim/scripts/vsim_group.tcl +++ b/sim/scripts/vsim_group.tcl @@ -2,22 +2,21 @@ # Solderpad Hardware License, Version 0.51, see LICENSE for details. 
# SPDX-License-Identifier: SHL-0.51 -# Create group for Tile $1 onerror {resume} -set group_path $1 +quietly set group_path $1 +quietly set parent_grp $3 # Add waves for remote xbar for {set p 0} {$p < $2} {incr p} { onerror {resume} - set xbar_path ${group_path}/gen_remote_tile_xbar[$p]/i_tile_remote_xbar + quietly set xbar_path ${group_path}/gen_remote_tile_xbar[$p]/i_tile_remote_xbar - add wave -noupdate -group Group -group remote_xbar[$p] ${xbar_path}/* + add wave -noupdate -group "${parent_grp}" -group remote_xbar[$p] ${xbar_path}/* } -add wave -noupdate -group Group -group refill_xbar -group req_xbar ${group_path}/i_refill_xbar/i_req_xbar/* -add wave -noupdate -group Group -group refill_xbar -group rsp_xbar ${group_path}/i_refill_xbar/i_rsp_xbar/* +add wave -noupdate -group "${parent_grp}" -group refill_xbar -group req_xbar ${group_path}/i_refill_xbar/i_req_xbar/* +add wave -noupdate -group "${parent_grp}" -group refill_xbar -group rsp_xbar ${group_path}/i_refill_xbar/i_rsp_xbar/* - -add wave -noupdate -group Group -group Internal ${group_path}/* +add wave -noupdate -group "${parent_grp}" -group Internal ${group_path}/* diff --git a/sim/scripts/vsim_tile.tcl b/sim/scripts/vsim_tile.tcl index 09e5938..5a9565e 100644 --- a/sim/scripts/vsim_tile.tcl +++ b/sim/scripts/vsim_tile.tcl @@ -5,38 +5,44 @@ # Create group for Tile $1 onerror {resume} -set tile_path $3 +quietly set tile_path $3 +quietly set parent_grp $4 + +# --- Configuration Variables --- +# NrTCDMPortsPerCore: 4 Spatz ports + 1 Snitch port +quietly set NUM_XBARS 5 +quietly set SNITCH_IDX [expr {$NUM_XBARS - 1}] # Add waves for tcdm_mapper and csrs -# add wave -noupdate -group group[$2] -group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* -# add wave -noupdate -group group[$2] -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* +# add wave -noupdate -group ${parent_grp} -group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* +# add wave -noupdate -group 
${parent_grp} -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* # Add waves for xbars -add wave -noupdate -group group[$2] -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* -add wave -noupdate -group group[$2] -group tile[$1] -group wide_xbar ${tile_path}/i_tile/i_axi_wide_xbar/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group wide_xbar ${tile_path}/i_tile/i_axi_wide_xbar/* -add wave -noupdate -group Barrier -group group[$2] -group tile[$1] ${tile_path}/i_tile/i_cachepool_tile_barrier/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group Barrier ${tile_path}/i_tile/i_cachepool_tile_barrier/* # Add waves for cache controller for {set c 0} {$c < 4} {incr c} { onerror {resume} - set cache_path ${tile_path}/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller + quietly set cache_path ${tile_path}/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[4]/gen_amo/i_cache_amo/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[${SNITCH_IDX}]/gen_amo/i_cache_amo/* - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group core ${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl1 
${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group core ${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl1 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* - add wave -noupdate -group group[$2] -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* } -for {set c 0} {$c < 5} {incr c} { - add wave -noupdate -group group[$2] -group tile[$1] -group cache_xbar -group xbar[$c] ${tile_path}/i_tile/gen_cache_xbar[$c]/gen_remote_group_slice/i_cache_xbar/* +for {set c 0} {$c < $NUM_XBARS} {incr c} { + add wave -noupdate -group ${parent_grp} 
-group tile[$1] -group cache_xbar -group xbar[$c] ${tile_path}/i_tile/gen_cache_xbar[$c]/gen_remote_group_slice/i_cache_xbar/* } # Add waves for remaining signals -add wave -noupdate -group group[$2] -group tile[$1] -group Internal ${tile_path}/i_tile/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group Internal ${tile_path}/i_tile/* diff --git a/sim/scripts/vsim_wave.tcl b/sim/scripts/vsim_wave.tcl index 723184b..a5f9347 100644 --- a/sim/scripts/vsim_wave.tcl +++ b/sim/scripts/vsim_wave.tcl @@ -6,9 +6,11 @@ onerror {resume} quietly WaveActivateNextPane {} 0 # --- Configuration Variables --- -set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster -set NUM_GROUPS 4 ;# Change this variable to match your total number of groups -set NUM_CORES 4 ;# Assuming 4 cores per tile based on original script +quietly set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster +quietly set NUM_GROUPS 4 ;# Total number of groups +quietly set NUM_GROUPS_X 2 ;# X dimension of group mesh (NUM_GROUPS_Y = NUM_GROUPS / NUM_GROUPS_X) +quietly set NUM_TILES 4 ;# Tiles per group +quietly set NUM_CORES 4 ;# Cores per tile # Add the cluster probe add wave /tb_cachepool/cluster_probe @@ -16,43 +18,34 @@ add wave /tb_cachepool/cluster_probe # Cluster do sim/scripts/vsim_cluster.tcl ${cluster_path} -# Iterate through all groups +# Iterate through all groups using 2D coordinates for {set g 0} {$g < $NUM_GROUPS} {incr g} { - set group_wp_path ${cluster_path}/gen_group[$g]/i_group - set group_path ${group_wp_path}/i_group - - # 1. Plot all GroupWP levels of all groups - add wave -noupdate -group "GroupWP_$g" ${group_wp_path}/* - - do sim/scripts/vsim_group.tcl ${group_path} 5 - - # Conditional plotting based on the group - if {$g <= 1} { - # 2. Call to plot tile 0 and tile 3 for Group 0 only - foreach tile {0 1 2 3} { - set tile_path ${group_path}/gen_tiles[$tile]/gen_tile - do sim/scripts/vsim_tile.tcl $tile $g ${tile_path} - - # 3. 
Plot all cores in the plotted tile + quietly set gx [expr {$g % $NUM_GROUPS_X}] + quietly set gy [expr {$g / $NUM_GROUPS_X}] + quietly set group_wp_path ${cluster_path}/gen_group_y[${gy}]/gen_group_x[${gx}]/i_group + quietly set group_path ${group_wp_path}/i_group + quietly set gwp_name "GroupWP_x${gx}_y${gy}" + + # 1. Plot GroupWP signals for this group (always, all groups) + add wave -noupdate -group "${gwp_name}" ${group_wp_path}/* + + # 2. Plot Group-level signals nested inside GroupWP (always, all groups) + do sim/scripts/vsim_group.tcl ${group_path} 5 "${gwp_name}" + + # 3. Plot all tiles and cores for the diagonal groups: (0,0) always, + # and (1,1) if the mesh has at least 2 columns and 2 rows + if {($gx == 0 && $gy == 0) || ($gx == 1 && $gy == 1 && $NUM_GROUPS_X >= 2)} { + for {set tile 0} {$tile < $NUM_TILES} {incr tile} { + quietly set tile_path ${group_path}/gen_tiles[${tile}]/gen_tile + do sim/scripts/vsim_tile.tcl $tile $g ${tile_path} "${gwp_name}" + + # 4. Plot all cores grouped under their tile for {set core 0} {$core < $NUM_CORES} {incr core} { - set core_path ${tile_path}/i_tile/gen_core[$core] - # Pass an empty string to indicate NO parent group - do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "" + quietly set core_path ${tile_path}/i_tile/gen_core[${core}] + do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "${gwp_name}" "tile[${tile}]" } } - } else { - # 4. 
Plot core 0 in tile 0 of other groups - set tile 0 - set core 0 - set tile_path ${group_path}/gen_tiles[$tile]/gen_tile - set core_path ${tile_path}/i_tile/gen_core[$core] - - do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "GroupWP_$g" } - # set group_wp_path ${cluster_path}/gen_group[1]/i_group - # set group_path ${group_wp_path}/i_group - # set tile_path ${group_path}/gen_tiles[2]/gen_tile - # do sim/scripts/vsim_tile.tcl 2 ${tile_path} } # Add DRAM waves once at the end diff --git a/sim/scripts/vsim_wave_single_tile.tcl b/sim/scripts/vsim_wave_single_tile.tcl index 28e54e5..125d849 100644 --- a/sim/scripts/vsim_wave_single_tile.tcl +++ b/sim/scripts/vsim_wave_single_tile.tcl @@ -5,9 +5,10 @@ onerror {resume} quietly WaveActivateNextPane {} 0 -set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster -set group_path ${cluster_path} -set tile_path ${group_path}/gen_tile +quietly set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster +quietly set group_wp_path ${cluster_path}/gen_group[0][0]/i_group +quietly set group_path ${group_wp_path}/i_group +quietly set tile_path ${group_path}/gen_tiles[0]/gen_tile # Add the cluster probe @@ -15,11 +16,11 @@ add wave /tb_cachepool/cluster_probe do sim/scripts/vsim_cluster.tcl ${cluster_path} -do sim/scripts/vsim_tile.tcl 0 ${tile_path} +do sim/scripts/vsim_tile.tcl 0 0 ${tile_path} # Add all cores in Tile 0 for {set core 0} {$core < 4} {incr core} { - set core_path ${tile_path}/i_tile/gen_core[$core] - do sim/scripts/vsim_core.tcl 0 $core ${core_path} + quietly set core_path ${tile_path}/i_tile/gen_core[$core] + do sim/scripts/vsim_core.tcl 0 0 $core ${core_path} "" } for {set ch 0} {$ch < 4} {incr ch} { diff --git a/software/tests/CMakeLists.txt b/software/tests/CMakeLists.txt index c090359..a3e85e5 100644 --- a/software/tests/CMakeLists.txt +++ b/software/tests/CMakeLists.txt @@ -123,3 +123,5 @@ add_spatz_test_oneParam(idotp-32b idotp-32b/main.c 32768) add_spatz_test_oneParam(load-store load-store/main.c 16) 
+ +add_spatz_test_zeroParam(bandwidth bandwidth/main.c) From d6639409a35607b27936c3b59a88b54fe957afe3 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 22 May 2026 09:46:16 +0200 Subject: [PATCH 23/37] [SW] Add bandwidth test. --- software/tests/bandwidth/data/data.h | 336 ++++++++++++++++++++ software/tests/bandwidth/main.c | 157 +++++++++ software/tests/bandwidth/script/bw.json | 17 + software/tests/bandwidth/script/gen_data.py | 127 ++++++++ 4 files changed, 637 insertions(+) create mode 100644 software/tests/bandwidth/data/data.h create mode 100644 software/tests/bandwidth/main.c create mode 100644 software/tests/bandwidth/script/bw.json create mode 100644 software/tests/bandwidth/script/gen_data.py diff --git a/software/tests/bandwidth/data/data.h b/software/tests/bandwidth/data/data.h new file mode 100644 index 0000000..c5be45f --- /dev/null +++ b/software/tests/bandwidth/data/data.h @@ -0,0 +1,336 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// This file was generated automatically by script/gen_data.py. 
+ +#include + +const uint32_t M = 4096; +const uint32_t R = 8; + +static int data_dram[4096] __attribute__((section(".data"))) = { + 2, 79, -8, -86, 6, -29, 88, -80, 2, 21, -26, -13, 16, -1, 3, 51, + 30, 49, -48, -99, -13, 57, -63, 29, 91, 87, -80, 60, -43, -79, -12, -52, + -42, 69, 87, -86, 89, 89, 74, 89, -50, 7, -46, -37, 30, -50, 34, -80, + -28, 66, -83, 31, -12, -41, -87, -92, -11, -48, 29, -17, -9, 10, 87, 98, + 71, -93, 74, -66, -20, 63, -51, 3, 31, -99, 33, -47, 5, -97, -47, 90, + 45, -57, 61, 89, -87, -6, -53, -86, 99, 89, -61, -19, 10, -48, -77, 53, + 87, 23, -60, 56, -86, -56, -36, -12, -30, -92, -13, 28, 35, -38, 38, -20, + 35, 62, 62, -68, 22, -96, -60, -73, 34, -29, -89, 61, -68, -53, 50, -39, + -64, -2, 71, 3, -66, 92, 0, 74, 30, -100, -96, 41, 2, -74, 36, -86, + -11, -59, 23, 78, -38, -5, -49, -5, 31, 50, 42, 70, -72, -65, -88, 59, + -30, 86, -15, -73, -35, 69, -56, -39, 84, 33, -73, -73, 7, -57, -17, -71, + 89, -26, 27, -9, 89, 28, 20, -74, 89, 20, 15, -98, 2, 97, 99, 54, + 36, -39, 64, -50, 71, 51, -42, 17, 59, -5, 79, 12, -39, 85, -49, -89, + -62, 29, 30, 12, 0, 12, 83, -20, 86, 12, -99, 29, -47, -14, 28, 46, + 25, 29, -48, 71, 59, 97, 59, -33, 82, 83, 22, 44, -63, -77, -32, 15, + -3, 97, 38, 43, -4, 23, 86, -31, -8, -98, 47, 86, 63, 46, -11, 94, + 46, 47, -5, 98, -49, 60, 67, 27, -62, -19, 3, 28, -90, 84, 77, 50, + 58, -59, -2, -94, 43, -11, 11, -41, 12, -99, 28, -53, 39, 96, -64, 59, + -92, -2, 46, -53, 30, 47, 51, -47, 19, 60, 51, 15, -26, 12, 99, 63, + 65, 3, -17, 11, -2, 52, -8, 45, 27, 9, -19, 93, -47, 62, 88, 68, + 60, -33, -68, 41, -80, -53, 47, 27, 35, 34, 94, 44, 27, -68, 75, 86, + 14, 18, -79, 57, -63, 8, -50, 81, -93, -74, -74, -80, -71, -4, -73, 10, + 91, 96, -40, -53, 46, -97, -66, 91, -52, -84, 71, 57, -55, 16, -95, -2, + 23, -64, -77, -8, -55, 80, -6, -2, 87, 15, 90, 59, 60, -34, 27, -83, + -76, -47, -43, -34, 3, 73, -77, 13, -69, 74, -15, 50, 93, 26, 54, 29, + -84, 3, 60, 36, -58, 75, -62, 69, -75, -2, -51, 52, 51, -88, -41, 34, 
+ -44, -65, 72, -81, -36, -93, 43, 41, 14, 42, -9, -3, -35, -69, 90, -15, + -50, 52, 85, -38, 89, 24, 49, -43, -43, -15, -52, 79, 69, -31, -86, -47, + 87, 0, -93, -48, -41, 7, -96, 2, 95, -95, 8, 15, -7, -54, -2, -46, + 67, -49, 43, -88, 13, 23, 5, 57, 46, 44, 19, -38, -82, -9, -43, 82, + -11, 16, -39, -78, 26, 36, 39, 28, -43, 21, -100, -67, -5, 25, 17, -53, + -12, 16, 28, -85, 88, 91, 90, -32, -79, -8, 94, -25, 53, 43, 78, -15, + 84, -72, -32, -54, -7, 89, 96, 43, 75, -16, -62, -1, -68, 0, -78, -91, + -32, -1, -67, 79, 37, 46, 85, -5, -100, -32, -97, -85, -77, -21, -99, 27, + 59, -17, 51, 39, 77, 62, 23, -68, 60, 88, 78, 70, 0, -89, -34, -36, + 60, 67, -27, -58, -57, -72, 40, -89, -6, -55, 29, -66, -20, -11, -93, -8, + 53, -11, 61, 14, 4, 34, 95, -43, 13, -26, 56, 19, 63, -80, 63, 37, + 0, 51, 91, 76, -2, -65, -5, 51, 50, 89, -64, -89, 12, 82, -88, -78, + -2, 4, -71, -84, 12, -39, -17, 11, -15, 40, 86, -82, 76, -1, 39, 88, + 46, -25, -92, 98, -73, 27, -49, -18, 10, 43, 16, -32, -2, 39, -76, 79, + 22, -48, 50, 43, -44, -62, 8, 80, -59, 85, 66, 41, 21, 32, 62, -26, + 45, -25, -92, -27, 85, 44, -94, 73, 40, 67, 69, 36, 77, -74, 21, 93, + -96, -72, 64, 65, -18, 35, 8, -36, -15, 44, -30, -56, 31, -65, -31, 58, + -82, 88, 7, 81, 66, -10, -11, -82, -62, 25, 94, 72, 40, 25, -43, 47, + 99, -40, 26, 4, 66, -100, 30, -9, 89, 90, 12, 52, -45, 60, 65, 16, + 33, -43, -57, 72, 59, 72, -40, -54, 48, -21, 17, 63, 46, -81, 84, 45, + -54, -52, -87, 42, 58, -100, 16, -47, 17, -98, 43, 84, -89, -27, -85, 99, + 1, 51, 55, 16, -93, 21, -9, 63, -11, 35, 85, -41, 77, -73, 0, -60, + 91, 54, 90, 44, 60, 56, 40, -55, -66, 33, -19, 14, 96, -54, 52, 93, + -91, -45, -71, 8, -96, 18, -68, 17, -36, 45, 76, -90, -16, -75, -38, -15, + -42, -74, 76, 60, -3, 4, -2, 28, 48, -46, -95, 96, -6, 32, 1, -98, + 80, -78, -48, 64, -18, 44, -16, -23, 9, -100, -50, 72, -97, 89, 92, 12, + -69, -67, -9, -6, -29, -62, 53, 61, 81, 17, -98, 22, -51, -89, 92, -47, + 32, -44, 44, 11, -54, 50, -16, 41, -35, -26, 78, 
2, 65, 91, -63, -51, + -3, -19, -71, -22, -10, 78, 90, -49, 65, -22, -71, 5, -50, -20, 32, -72, + 31, 37, 83, 44, -27, -84, -17, -32, -67, -95, -48, 93, 25, -58, 14, 10, + 50, 82, -21, -6, 17, 43, -93, 31, 3, 31, 83, -76, 94, -5, 94, 54, + -8, 59, 77, -40, 21, -50, 46, -80, -96, -9, 69, -40, -79, 48, -31, -100, + 32, -89, -11, -55, -67, 76, -23, -56, 54, -28, -75, -54, 20, -45, -7, 6, + -38, -53, -40, -20, -75, -65, -100, -93, 12, -2, 79, -54, 26, -45, -87, -73, + -23, 29, 8, 53, -87, 86, -45, 14, -94, -98, 10, 50, 6, -83, -63, 14, + -86, 91, 18, -73, -62, 84, -84, -15, 25, -57, -76, 44, -88, -76, -33, 37, + -34, 8, 45, 10, 10, -67, 10, -93, 67, 12, -18, -59, 68, 0, -95, 79, + -75, -37, 86, 83, -42, 8, 97, 20, -68, 80, 49, -80, 97, -31, 11, -97, + -7, -26, 89, -39, -7, -6, 51, -46, 36, 30, -70, -61, -65, 51, -95, -35, + -26, -97, -22, 33, 17, 12, -7, 78, -39, 84, 93, -22, 35, -75, 78, 72, + -57, 32, -31, 53, 17, -33, -82, -81, 12, 39, -54, -100, -11, 41, -37, -63, + -64, 25, 38, -1, -24, -98, 60, 33, 77, -91, -96, 50, 37, 71, 29, -88, + 67, 29, -17, -36, -38, 0, -28, -84, -92, 42, 51, -63, 62, -7, -6, -52, + -32, -39, 87, 77, -23, 16, 19, 36, 61, -25, -66, -100, -61, -37, 49, -41, + -37, -8, -29, -90, 4, -87, -41, -71, -66, 64, -96, -18, 22, -23, 53, -39, + 31, -12, -59, -12, 45, -61, -29, -62, 3, -87, -69, 78, 24, 65, -4, -78, + -38, 42, 52, 44, 93, -48, 3, 3, 78, -62, 78, 97, 33, -34, 34, 78, + 99, -59, 91, -86, 10, 56, -68, 54, 63, 56, -63, 84, -4, 54, 82, -68, + 95, -15, -35, -91, -96, 22, -27, -4, 17, -63, 40, 58, -54, 8, 79, 83, + -86, 56, 35, -96, 56, -54, -33, -25, 72, 29, 54, -6, 14, 63, -65, -75, + -58, -74, -32, 47, -90, -27, 3, -63, 33, 99, -78, -54, -11, -55, 6, 39, + 40, 3, 89, 10, 3, -41, 70, -1, -33, 32, 19, -64, -29, 26, 5, -9, + -70, 21, 36, -50, -72, -61, -60, -90, 50, -100, -55, 0, 48, -11, 9, 63, + 81, -44, -100, -38, 81, -46, 14, 67, 42, 48, 74, -48, 36, 79, -44, 16, + 53, 68, -66, -38, -76, -11, -26, -63, 29, 34, 61, -10, -84, 27, 70, 86, + 
7, 78, 81, 51, -76, 98, 79, 97, -13, 60, 76, -72, -38, 49, 53, 55, + 13, -52, 98, -20, -17, 4, 76, 47, 17, -15, -38, 88, 3, -52, -30, -100, + 40, 78, 22, 83, -39, -69, 57, -72, -52, 72, -71, 43, 67, -82, -83, -100, + -23, 14, 74, 93, 12, 13, 65, 78, -38, 31, 28, -93, 56, -46, 2, 30, + -69, -91, 9, 61, 23, -14, -46, -69, 77, 34, -8, -93, -36, -44, -34, 86, + -29, -47, -34, -50, -4, -9, -93, 61, -66, -5, -13, -23, -69, 19, -55, 43, + 95, 64, 81, 41, -46, 75, -94, -27, -94, 60, 50, -16, 46, 11, 24, 46, + -65, -72, 87, -19, -99, 28, -54, 96, 47, -90, 29, 94, 18, 39, -81, -96, + -64, 65, 36, -48, 71, -2, -15, 7, -77, -20, 57, 19, -42, 27, 13, 14, + -87, -92, -61, 93, -76, 2, -79, -97, 53, 85, -72, 8, -3, -64, -26, -31, + 45, 27, -16, -59, -1, -60, 65, -67, -84, 64, 52, 21, -25, -14, 16, 5, + -15, -74, -44, -70, 33, 67, -3, -89, -48, 98, 37, 72, 44, 53, 89, 21, + -55, -37, 29, -47, 6, 92, -50, 80, -65, 14, -75, -72, 48, -90, -61, -90, + 63, 12, 86, -62, -47, -3, -46, 10, 51, 49, -32, -45, -68, -65, 47, -20, + 89, 8, 53, 93, -28, -61, -84, 28, -40, 70, 3, -59, -76, -62, -66, 30, + 7, 71, 78, 1, -3, -89, -82, 0, -57, 86, -52, 88, -84, -27, -44, -46, + -54, -89, -39, -21, -18, -93, 48, -20, 17, 97, 99, -76, -12, -89, 42, -42, + 53, 8, -75, -54, -69, -91, -85, 98, 44, 50, -75, -15, 34, -87, -22, 16, + -94, 36, 14, 75, -29, -42, -19, 15, 66, 45, 86, 2, -84, -87, 58, 51, + 7, 87, -56, -3, 30, -64, 70, 67, 82, 7, -62, -86, 19, -97, -8, 52, + 40, 60, -85, 69, 7, 25, 93, -46, 69, -67, 57, -88, 40, -83, 59, -5, + 22, -62, -55, -72, 89, -39, 84, -85, 83, 20, -91, 57, 52, 2, -17, 32, + -36, 76, -98, 72, 41, 19, 57, -33, 45, 89, -64, 52, 75, 92, 80, -28, + 42, -52, 20, -13, 95, 39, 86, -64, -40, 70, 23, 97, -62, 83, -38, -55, + -13, -90, -39, -24, 52, 98, -49, -97, -42, -29, 47, -8, -38, -47, 16, 9, + -27, 6, -44, 68, -98, 19, -95, -96, -96, -47, -54, 76, -92, -2, -81, 21, + -40, 22, 62, 77, -19, -39, 44, -98, 59, -88, 19, 24, -12, -28, -33, 41, + 0, -3, -90, 4, 83, -34, 22, -72, 36, 
94, 52, -25, 69, -92, -61, -76, + -96, 38, 86, 23, -63, -29, 32, 75, -11, 45, 97, -64, -41, 75, -73, 96, + -20, 13, -61, -56, -39, -8, 14, 85, -34, -45, -61, -3, -61, -10, -68, -95, + 46, -21, 67, -9, -4, -42, -42, -53, 68, -88, 6, -56, -12, 80, 69, -1, + 90, 54, 65, 87, -94, 32, -56, -43, -70, -78, 69, 37, 59, 6, 80, 57, + -62, 58, -19, -1, -99, -24, -89, -62, 93, -88, -55, 54, 12, -84, 3, -3, + 87, 91, -13, 8, 62, -31, 57, 98, -27, -38, 55, -41, -22, -64, -79, 95, + 47, -5, -33, 91, -84, -89, 49, 44, 37, 17, 92, -79, 19, -51, -87, 91, + 74, -72, 63, 79, 24, -94, -72, 38, -68, 33, 10, -22, 59, 78, -43, -75, + 73, -72, -97, -2, 40, -62, -86, 18, -72, 56, -26, 15, -69, 29, 74, -17, + -79, -74, 94, -20, 25, 83, -9, -80, -54, 0, 86, -17, -95, 46, 84, 96, + 30, -100, -43, 23, 65, -86, 61, 88, 62, -37, -76, -77, -89, -83, -86, -74, + -29, -41, 73, 44, -8, -48, -93, -22, -74, 73, -51, 98, 30, -8, 91, 71, + 80, -77, -41, 21, 30, 47, -55, -86, 52, 98, -84, 25, -17, -92, 65, 75, + 87, -38, 16, 29, -13, -29, 38, -37, -29, -80, -83, -37, -19, 9, -6, 65, + -69, -90, -56, -68, 1, 68, 35, -90, 78, -13, -60, 44, -25, 20, -55, 59, + -21, -47, -15, -9, 47, 60, 67, 59, -4, -99, -68, 57, 60, -24, 15, 53, + 49, 2, 93, 65, 2, -11, 73, 21, 42, 5, 18, 71, -52, 3, -40, 94, + 22, 33, -92, -95, -28, 2, 59, -60, 13, -93, -29, 77, -9, -39, -17, -39, + 10, 26, -94, -17, 31, 33, -43, -79, 53, 26, -98, 26, -60, 87, 5, -87, + -89, -14, -89, 40, -76, -56, 21, 46, -46, -1, 72, 35, 18, -48, 82, -69, + -50, -57, -19, 97, 45, 49, -64, -5, -45, 4, -42, 30, 55, -66, 88, -8, + 81, 90, -4, 34, -25, -52, -7, 58, 85, 88, -77, 47, -64, -97, 7, 25, + -81, 87, 51, -51, 76, 78, -5, 4, 63, 16, -3, -35, -22, 61, 63, -95, + -62, 69, 14, 85, 24, -89, 30, -83, -61, -80, 42, -2, -75, -93, -1, -14, + -59, 71, -87, -23, -99, 18, -64, -32, 65, -60, -39, -41, 24, 10, -87, -17, + 29, -31, 44, 22, -12, -73, -35, 62, -96, 92, 67, 83, 63, -56, 50, -54, + -70, 38, -91, 94, 17, -87, -21, -8, -91, -25, -7, 4, -100, 32, -81, 
-93, + 70, -100, -44, -37, 17, -73, 20, 6, 64, 74, 64, -52, 19, -57, -45, 55, + 10, 33, 7, 88, 47, 13, -23, 29, -64, -81, -75, -72, -100, 18, -66, 97, + -42, -32, 60, 78, -89, -81, 80, -58, 25, -51, -72, 84, -62, 83, 70, 77, + 50, 71, -63, -17, 50, 27, 25, -56, -49, -21, 77, 23, 18, -93, -16, 28, + -93, 56, -5, 1, -96, -15, 74, -67, 92, -46, 18, 70, 57, 13, 44, -75, + 35, -54, 12, -25, -29, -12, -55, 95, -68, -100, 46, -70, 41, -48, -53, -32, + 4, -11, 89, 93, 91, 97, -38, -28, 60, 68, -12, 75, -88, -28, -24, 58, + -33, -3, 24, 1, 31, 1, -82, 97, -19, 40, -78, -18, 8, -26, 28, -13, + -47, 98, -12, 42, -36, -25, 98, 42, 48, 46, -41, -8, 60, -52, -66, 71, + 1, -90, -2, 99, 9, -27, 93, 73, -92, -98, -16, -75, -29, -53, 93, 97, + 55, 16, 53, 52, -89, 32, -53, -64, -57, -86, -58, 75, -14, -31, -55, 7, + 22, 70, -80, 50, 6, -21, -54, 16, 23, 42, -76, -65, -50, -86, -33, -80, + 40, -82, -67, -23, 24, 95, 16, -49, -74, 56, 55, -66, 35, 43, 5, 63, + 57, 45, -99, 76, -24, 31, -11, -12, -7, -94, -6, 7, -27, -62, 40, 58, + -30, 90, -45, 69, 88, -59, 83, -65, -17, 95, 17, -94, -47, 28, 71, 19, + 72, 5, -54, 22, 40, 18, 49, 17, -80, 36, -75, -92, -86, -91, 21, 20, + 24, 69, -29, 51, -52, 53, -10, -64, -33, -51, 51, -82, -1, -41, 66, -12, + 45, 46, 63, 82, -32, 74, -52, 7, 60, 18, 48, 49, 75, -64, 42, 62, + 98, 59, -90, 37, -23, -24, 84, 52, -99, -37, -66, -94, -16, 45, -34, -59, + 17, 24, 66, 68, -77, 87, -55, -82, -33, 1, -59, -69, 66, 8, -65, -4, + 63, 30, -28, 35, -4, 22, -31, 68, 95, -88, -52, -41, 29, 87, -96, 71, + -74, 59, 48, -93, 99, -51, -34, -57, 24, -43, -98, -15, -31, 29, 71, -99, + 1, -66, -3, -71, 15, -93, 49, -57, 85, -65, 96, -24, -70, -54, -46, 5, + 89, 57, 20, -46, 43, -89, 44, 43, 39, 35, -98, 43, -71, -78, -9, 14, + 3, 17, -96, 3, 7, -36, 20, -71, 48, -17, 97, 49, 99, -69, 60, -85, + -63, 41, -65, 50, 13, -35, 4, -16, -99, 92, -26, -34, 79, 89, 50, 53, + 95, -46, -95, 62, 49, 70, 42, -15, -41, 0, -89, -52, -59, 65, -19, 72, + -66, 57, 57, 33, -34, -21, -94, 
79, -87, -28, 42, 48, 62, -35, -10, 29, + 62, -81, -91, -76, 93, -76, -47, -68, 63, -6, -87, -36, 9, -21, -47, 24, + 79, 30, 68, 33, -76, 40, 16, 35, 48, 95, -76, -22, -33, 72, 87, -89, + -69, 27, -5, -9, 40, -70, -23, -81, 45, -41, 90, 33, 59, -8, -58, 29, + -41, 50, -70, -96, 61, -91, -12, 29, 25, -90, 77, 58, -45, -18, 36, -2, + 48, -28, 55, 89, 14, -94, -21, 43, -4, -97, -97, 2, -61, -84, 0, 2, + 23, -49, -49, 63, 23, 65, 77, -38, 15, 90, -39, -94, 54, -9, -87, 98, + -15, 32, 82, 29, -39, 48, -22, 48, 37, -43, 19, 66, 32, -36, -88, 80, + -19, 72, 59, 63, 87, 38, 90, 48, -49, 82, 16, -75, 95, -67, -49, 11, + -19, 79, 56, 65, -59, -80, -59, 40, 12, 4, 91, -84, 19, 36, 68, 28, + 29, -41, -53, -72, -4, -91, -43, 87, -80, -73, -67, -93, 86, 99, 76, -22, + 44, 87, -14, -13, 72, -32, 59, 71, 92, -14, 0, -93, -24, 99, -14, -58, + 28, 8, 78, 51, 51, -84, 32, 47, -80, -43, 58, 9, 38, 79, 42, 34, + -73, -50, 85, -96, -96, -7, -5, -93, -37, 37, -63, 10, 11, -20, 35, 3, + 52, -28, 93, 77, 94, -52, -65, 25, -33, -23, 4, 68, 21, 18, 89, 13, + -91, 48, 63, 39, -10, -99, 17, -88, 80, 59, -6, 43, -69, -69, 23, -90, + -91, -86, -84, 67, 45, 82, 52, -42, -23, 23, 13, -79, 57, 71, -69, 78, + 73, 12, 19, 48, 21, -69, 18, -62, -89, 74, -83, 48, 40, 48, 25, 35, + -55, 3, 34, 50, -15, 49, -12, -79, -9, -72, -88, -99, -72, 79, -74, -51, + -6, 50, -13, -57, -79, 70, 64, 88, -4, 83, -98, 90, 17, -82, -78, 56, + -71, -62, 7, 10, -76, 27, 10, 35, 5, 85, 43, -29, 28, 7, -96, -98, + 92, 31, 3, -37, 5, -3, -91, 48, 27, -55, 54, -35, 68, 11, -88, 36, + 63, -15, -11, -43, -47, -89, -87, 20, 61, 80, 20, -81, -26, 43, -31, -45, + 0, 85, 31, 10, 24, 31, -81, 37, -77, 53, -64, -47, 2, 48, -63, 73, + -97, 87, -44, 72, -81, -84, 18, -30, -97, -87, -98, -21, 85, -46, 31, 19, + -77, 61, 57, -19, 46, 58, 87, -41, 46, -52, -38, 9, 49, 51, 8, -25, + -71, -43, 16, 58, -1, 91, -82, 75, 72, -7, 82, 12, 82, 45, -38, 15, + 12, -54, -44, 17, -56, 60, 44, 97, -97, 47, -54, 42, 97, -54, -23, 23, + -26, 2, 36, -11, 
43, -32, -40, 37, -51, -98, 6, 12, -7, 17, 97, -20, + 8, -50, 46, 69, -66, -55, 53, -32, -8, -37, 64, -52, 48, -27, 91, -96, + -1, -31, -11, -97, 25, -62, 62, 28, -77, -44, -1, -31, -60, 81, 73, -41, + 7, 87, -44, -70, -17, 51, -65, 57, 51, 7, 5, 57, -15, 34, 62, 82, + -87, 19, 37, -21, 43, 81, 5, -31, -57, -28, -56, 83, 6, -17, -19, 6, + 84, -80, 95, 77, 60, -94, -31, 26, 42, 59, 47, 42, 75, 98, 48, 69, + 53, 19, 27, 41, -44, 64, -43, -80, -19, -32, 30, -14, 64, -52, 46, 14, + -12, -16, 27, 39, 88, 68, 60, 77, 16, 95, 26, -30, -15, -68, -20, 51, + -40, -40, -3, -73, 85, -58, 68, 66, -15, 76, -84, -3, 97, 97, -3, -36, + -27, -16, -96, -99, -79, 11, -44, -35, -27, -3, 43, 11, 78, 24, 66, -15, + -25, -13, 67, -63, 87, -42, 37, -2, 91, -38, -52, 91, -69, 34, -80, -81, + 53, 26, -33, 9, 15, 42, 19, -22, 15, -9, -87, 54, 24, 67, -43, -73, + -75, 12, 90, -35, 52, 30, 99, -57, -4, -59, -95, 66, 79, -78, -12, 22, + 77, 70, 46, 96, -50, -90, -89, -27, -93, 51, -60, 63, -16, -29, -46, 36, + -54, 42, 75, 3, -59, -41, 97, -89, 29, -94, -21, 30, 8, -98, -74, 13, + 6, -61, 60, 28, -93, 0, 59, 23, 75, 6, 0, -97, 97, 9, -82, -69, + 36, -89, 38, -55, 18, -85, 28, -38, 8, 18, -80, 57, -88, 61, -97, 55, + 1, -42, 27, -74, -32, -76, 30, -4, 95, -76, 47, 61, 53, 25, 32, 88, + -92, 15, 77, 33, -41, 71, 20, 38, 93, 74, 7, 7, 30, -25, -71, -80, + 55, 89, 60, 62, 79, -67, -60, -50, 74, -19, -43, -26, 84, -78, 75, 27, + 49, -84, -32, 7, 38, -7, -86, -10, 62, -68, 24, -85, 61, -35, 67, 55, + -54, 66, 97, 47, 23, -100, 99, -14, 27, 90, -34, -97, 85, 52, -54, -63, + 42, 90, -80, -44, 16, 80, -55, 41, 44, -62, 36, -2, -24, 14, 33, 50, + 86, -10, -3, 19, -55, -63, 41, -1, -29, 89, -66, 46, -71, -23, -17, 12, + -76, 67, -13, -87, 53, 1, 49, 75, -18, 9, 78, 30, 69, -82, -73, 72, + -30, -38, -28, 23, 18, 20, -82, 74, 41, 19, 65, 80, -40, 63, 67, -71, + -92, -37, 60, 19, -49, -12, 41, -48, 86, -23, -60, 95, -33, -93, -99, 4, + 24, 9, -15, 68, 19, 26, 82, -85, -39, 12, 75, 57, -89, -66, -90, -37, + 
-51, 48, -32, -53, -81, 0, 94, 71, 60, 88, 36, -33, -66, -36, -93, -56, + -14, 74, 67, -25, 81, -25, 73, -54, -86, 4, 46, -75, -49, -8, 49, -80, + -50, -63, -81, 56, 78, 31, -46, 98, -9, -73, -97, 49, -25, 72, -34, -15, + 93, 61, -10, 82, 4, -100, -29, -14, 65, -95, 52, 44, -40, -89, -95, -74, + 35, 23, 5, 92, 84, -8, 8, -2, 67, 41, -66, 75, -17, -45, 76, 40, + 88, -52, -55, 12, 89, 35, -58, 47, 23, -4, -11, -62, -16, 76, 91, -39, + 5, 6, -47, 25, -92, -13, -64, -52, -60, -84, -13, -80, -50, -42, 66, 82, + -60, -62, 40, 91, -90, 46, 38, -24, 26, -55, -56, -100, 67, 7, -82, -25, + -29, 99, 88, 8, -89, 88, 60, -56, -11, 2, 96, -93, 52, -10, 73, -14, + 57, 74, 76, 74, -17, -19, 45, 93, -33, 60, 26, 63, -19, 46, 55, 67, + -17, 12, -25, 21, 42, -60, 66, -99, 90, -72, 4, 93, -50, -26, -91, 41, + -50, 17, 20, -62, -73, -98, -17, 97, 54, 16, -99, -8, 65, 30, 43, -7, + -61, -99, -20, 68, -89, 62, -36, -46, -56, 0, -39, -52, 26, 52, -78, 51, + -63, -73, -80, 81, 6, 45, 63, -2, 72, 18, -28, 80, 21, -71, 87, 7, + -3, -69, 94, 94, 55, 71, -22, -60, -38, -78, 3, -26, -53, 59, -44, 17, + -1, 23, 60, 51, 66, -67, 83, 55, 64, -94, 95, 64, -45, -36, 92, -54, + 47, -82, 56, 87, 45, 62, 70, 25, 38, -52, 12, -63, -37, -32, 35, 61, + 75, 15, 83, -36, -70, 31, 12, 8, 77, -32, 57, 19, -13, 86, -34, -26, + -56, -97, 53, 54, 73, 6, 91, -50, -51, -95, -42, 24, 42, -71, 69, 21, + 24, -64, 69, 23, 0, -66, -14, 33, 95, -83, 12, -72, -24, -42, 74, 25, + 44, 8, -79, -33, -32, 64, -52, 40, 61, 8, -15, -48, 22, -14, 78, 30, + -59, -3, 59, 8, -50, -16, 73, 26, -80, -57, 38, 2, -33, 28, -9, 90, + -77, 24, 40, 10, -3, 38, 78, -84, -2, -19, 3, 51, -82, 64, -16, -93, + -73, -96, 44, 0, 63, 17, 46, -37, -10, 16, -3, -90, -47, 72, -54, 39, + 48, 38, -96, 83, -81, 68, -55, -89, -31, -84, 14, 46, 53, -53, 7, -49, + -5, -31, 82, 1, -26, 60, 89, 50, 42, 54, 91, 78, 80, -19, -9, 78, + 74, -97, 34, 8, 62, -53, -28, -29, 15, 72, -20, -20, 12, -5, -29, 46, + -38, -63, -34, 86, -32, -88, 74, 88, -70, 13, -79, -8, 
70, -34, -47, 37, + -59, -37, 25, -80, 93, 68, 51, -66, 83, -71, -44, -54, 62, -82, 13, -93, + -75, 12, -30, -71, 34, -34, -36, -65, -76, -64, -100, 42, -64, 40, 35, 32, + 87, 27, 26, -100, 63, -66, 93, 90, 3, -78, -92, -67, 73, 27, -97, 57, + 1, -26, -2, -29, -98, 41, 87, 72, -67, -56, -40, -18, 22, 1, -85, -35, + 67, 44, 18, 61, 23, 43, 1, 87, -35, 93, -88, 83, 10, 58, -67, -62, + 94, 35, 19, 25, -46, -74, 17, -48, -62, -36, 44, 57, 80, -88, -80, 46, + -48, -82, -44, -76, -70, -20, 82, -78, -69, 89, -27, -91, 99, -47, -49, 47, + 59, 13, -87, 77, 55, 9, -48, 51, -22, -79, -71, 90, -97, -42, -93, -89, + 18, 58, -67, -19, -3, 26, -26, 4, -52, -22, 84, -8, 53, -83, -5, -63 +}; + +static uint32_t offset_dram[1024] __attribute__((section(".data"))) = { + 1024, 1408, 1472, 3136, 448, 2560, 320, 256, 2752, 3584, 4032, 1792, 1728, 640, 3584, 0, + 2688, 1152, 512, 1664, 2112, 2752, 192, 2240, 1088, 896, 3200, 768, 1152, 3072, 320, 3008, + 3520, 704, 3200, 2304, 1024, 3648, 1408, 1600, 1792, 2048, 3776, 1664, 1536, 3520, 3456, 576, + 2944, 1408, 1280, 1280, 2688, 768, 2304, 2048, 1472, 2752, 640, 1024, 192, 2944, 1536, 1856, + 512, 1280, 1216, 1152, 1152, 704, 2304, 2624, 1024, 2304, 3392, 3008, 2432, 1728, 4032, 2560, + 2176, 2944, 3008, 1088, 4032, 832, 2176, 2752, 2560, 3776, 384, 2560, 768, 1088, 2304, 768, + 2240, 1920, 64, 960, 512, 512, 2368, 640, 3392, 3520, 1344, 2048, 2240, 2816, 1408, 2816, + 2816, 2112, 512, 1472, 448, 384, 2240, 1088, 128, 1728, 320, 2752, 2752, 1280, 1728, 2496, + 3392, 3392, 0, 2240, 1280, 3776, 2688, 2176, 768, 3584, 3072, 3392, 3392, 576, 3328, 2752, + 2944, 3328, 576, 2880, 768, 2304, 2368, 1472, 64, 1344, 1024, 2240, 832, 576, 2176, 1536, + 128, 2944, 2368, 1024, 704, 3392, 3264, 1472, 1792, 2688, 1536, 2240, 3648, 3776, 1600, 2368, + 0, 1216, 3456, 1728, 960, 2944, 1024, 2496, 1728, 3648, 704, 1664, 3392, 3328, 2624, 3712, + 2944, 1856, 2560, 2112, 2304, 320, 384, 1984, 1984, 3328, 1536, 2048, 3456, 3648, 384, 3264, + 3584, 3904, 704, 
960, 960, 2688, 3328, 2240, 768, 384, 2240, 2944, 3200, 512, 1984, 640, + 448, 2048, 1408, 1536, 3520, 448, 2816, 1344, 3712, 192, 2496, 128, 1792, 2560, 1792, 2112, + 2112, 1728, 768, 896, 3776, 1408, 2752, 1600, 192, 1600, 2496, 512, 3456, 256, 3520, 1984, + 1216, 3136, 640, 896, 1984, 1728, 1344, 2560, 1024, 3712, 1600, 2752, 384, 2624, 3584, 1600, + 1536, 576, 3712, 2816, 1152, 640, 768, 3328, 448, 2496, 192, 2944, 1920, 1728, 640, 2688, + 0, 3072, 2944, 3264, 2368, 1472, 3392, 3776, 3584, 2944, 1920, 2560, 2880, 3136, 1600, 3072, + 3136, 256, 1216, 3264, 2560, 640, 3520, 3392, 2048, 3136, 2368, 1664, 2112, 896, 1984, 3840, + 3648, 1280, 2880, 256, 320, 2368, 3904, 1344, 3904, 1088, 832, 576, 2880, 768, 1920, 1856, + 128, 1856, 320, 1792, 448, 2752, 2816, 2688, 2752, 2048, 1024, 3008, 640, 768, 3136, 640, + 1920, 1728, 2752, 320, 2560, 960, 192, 2176, 3776, 3008, 3520, 4032, 3712, 1600, 3904, 3584, + 704, 640, 2304, 1536, 1088, 1152, 1408, 3456, 2368, 3392, 1088, 1600, 320, 192, 2752, 3200, + 1792, 3712, 3840, 2112, 1024, 2688, 2688, 3328, 960, 64, 2112, 3456, 448, 768, 2624, 448, + 1856, 1984, 2240, 2944, 1856, 0, 2688, 2432, 1024, 3712, 1216, 1472, 3136, 1344, 1792, 1344, + 2752, 512, 2304, 576, 2240, 2304, 1728, 4032, 1728, 2624, 3968, 2688, 1280, 1472, 2624, 3776, + 2496, 64, 3456, 448, 3008, 1664, 1344, 2304, 1664, 3584, 1792, 128, 2752, 768, 2816, 576, + 2240, 512, 3392, 3520, 3456, 1664, 2112, 3200, 0, 1984, 1536, 1792, 384, 3072, 128, 1216, + 1216, 3328, 1856, 2176, 2176, 3072, 2432, 1664, 1600, 832, 1024, 1088, 2816, 3264, 1792, 1216, + 3328, 1664, 1088, 192, 0, 3136, 0, 3648, 1600, 2816, 1216, 3264, 1408, 576, 3712, 640, + 2560, 2560, 256, 3648, 0, 1024, 1344, 2112, 3520, 2816, 2560, 1792, 192, 256, 4032, 640, + 3520, 3200, 1280, 2176, 704, 1600, 3200, 1152, 2816, 3776, 384, 2944, 3456, 384, 1600, 2496, + 256, 1280, 832, 192, 1408, 2624, 1216, 896, 3136, 3264, 2816, 1792, 448, 2560, 3008, 1152, + 3008, 2432, 2880, 1728, 2880, 3584, 1664, 192, 3648, 
896, 2176, 2304, 0, 128, 2944, 1472, + 1664, 2496, 3968, 3840, 2240, 2112, 1792, 1472, 1472, 2624, 192, 128, 2112, 3584, 3072, 1408, + 3328, 3520, 3264, 1408, 2816, 256, 1920, 3904, 3328, 3904, 2560, 3584, 3840, 3584, 3776, 3392, + 320, 2304, 3584, 576, 2496, 384, 2432, 2368, 1280, 3840, 1216, 128, 4032, 3200, 1344, 2944, + 192, 1728, 704, 2368, 3200, 3968, 2240, 2944, 2560, 256, 768, 1408, 3648, 3136, 1856, 3904, + 2304, 1536, 3008, 1664, 3200, 832, 3584, 3904, 3008, 3712, 2944, 3392, 2240, 2432, 960, 3520, + 1344, 1280, 3392, 1280, 448, 3072, 960, 3712, 3456, 1920, 2624, 576, 1984, 3712, 2112, 3968, + 3904, 512, 1024, 640, 1984, 576, 1344, 3584, 1024, 192, 3008, 3264, 2944, 1536, 3712, 1280, + 2304, 2368, 3072, 0, 3968, 1536, 3200, 2240, 1024, 2560, 3968, 1472, 64, 2496, 896, 1984, + 2432, 3712, 320, 0, 2944, 1408, 3648, 2432, 3840, 3904, 3584, 1024, 2112, 3712, 2560, 1792, + 512, 768, 320, 320, 1216, 3264, 3328, 128, 1024, 3968, 3072, 704, 3008, 2944, 2816, 3456, + 4032, 0, 1024, 128, 1600, 896, 2240, 1984, 1984, 4032, 192, 1088, 3072, 1152, 2688, 3264, + 2176, 2496, 832, 3648, 2048, 832, 2112, 3840, 3456, 192, 1472, 3712, 1792, 1152, 320, 3584, + 2048, 3072, 2496, 1152, 3328, 0, 256, 1664, 960, 320, 2496, 768, 3008, 4032, 3904, 2176, + 1152, 704, 960, 3712, 2688, 576, 1536, 1216, 1920, 192, 2752, 1408, 64, 3520, 1344, 3200, + 3584, 2304, 448, 384, 832, 3520, 3456, 128, 3136, 960, 3456, 1152, 3200, 1088, 3264, 960, + 448, 3776, 2048, 3072, 704, 3904, 3328, 640, 256, 1664, 704, 1024, 320, 2048, 256, 3584, + 1728, 832, 256, 3392, 3520, 3712, 2048, 3904, 3136, 3008, 2560, 1152, 1344, 256, 3712, 64, + 1856, 2944, 2432, 192, 2048, 768, 1216, 3392, 384, 896, 1408, 832, 1408, 1152, 4032, 3392, + 1792, 3520, 1216, 1792, 1472, 1088, 3968, 576, 2560, 3584, 448, 1600, 3904, 1792, 576, 1024, + 64, 320, 3648, 256, 1728, 3904, 2560, 256, 1088, 1600, 1216, 2560, 3072, 1664, 3072, 704, + 384, 3712, 128, 576, 2112, 1536, 2176, 960, 64, 3776, 1472, 576, 2368, 1344, 1600, 2688, + 
3456, 2432, 1664, 1792, 2944, 2368, 1216, 1664, 2560, 3392, 1216, 2752, 1152, 1216, 2176, 2944, + 64, 1856, 3328, 2112, 1344, 128, 320, 896, 1664, 576, 512, 2048, 1984, 3008, 320, 2560, + 3712, 2240, 3712, 3776, 64, 1792, 192, 0, 1152, 1600, 1536, 576, 3328, 896, 3712, 1344, + 3200, 3008, 1280, 3712, 3520, 1344, 2240, 2112, 1856, 1536, 2816, 1728, 0, 3072, 2816, 192, + 384, 3072, 512, 2240, 1152, 0, 576, 2240, 512, 2816, 2880, 2624, 1984, 1984, 320, 3008, + 2368, 1024, 576, 3136, 1664, 3392, 3136, 2176, 3264, 3712, 1280, 3968, 1536, 1728, 2944, 896, + 192, 1792, 3712, 512, 1664, 1600, 1152, 3520, 1728, 1024, 2688, 3584, 2560, 2624, 3136, 1216, + 2816, 2496, 2432, 3072, 3328, 1408, 2944, 1984, 3264, 2944, 4032, 3968, 2048, 3584, 2880, 256 +}; + diff --git a/software/tests/bandwidth/main.c b/software/tests/bandwidth/main.c new file mode 100644 index 0000000..a817c18 --- /dev/null +++ b/software/tests/bandwidth/main.c @@ -0,0 +1,157 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Diyou Shen + +#include +#include +#include +#include + +#include "data/data.h" + +// #define DEBUG + +// Random-load bandwidth benchmark — measures L1-to-core interconnect bandwidth. +// +// Phase 1 (warmup): all cores together stream through all of data_dram so that +// every cache line is resident in the L1 before measurement begins. 
+//   Requires M * sizeof(int) <= total L1 cache capacity.
+//
+// Phase 2 (measurement): each core issues random vector loads drawn from a
+//   pre-generated offset table.  Because all data is already cached, this
+//   measures the L1 hit bandwidth (local + remote tile xbar), not DRAM refill
+//   latency.  No correctness check is performed — only the cycle count matters.
+
+int main() {
+  const uint32_t num_cores = snrt_cluster_core_num();
+  const uint32_t cid       = snrt_cluster_core_idx();
+
+  // Spatz vector config: lmul=4, element width 32 b, VLEN=512 b
+  const uint32_t lmul      = 4;
+  const uint32_t vlen_bits = 512;
+  const uint32_t elem_bits = 32;
+  // Elements transferred by one vle32.v with lmul=4
+  const uint32_t v_len = lmul * vlen_bits / elem_bits;
+
+  // Loop is unrolled by four: only whole groups of four loads are issued.
+  const uint32_t measure_groups     = R / 4;
+  const uint32_t measure_iterations = measure_groups * 4;
+
+  // offset = 6 = log2(64 B cacheline) — the hardware minimum.
+  //
+  // With this offset and 256-byte-aligned accesses (step = v_len = 64 elems):
+  //   addr[7:6] = bank within tile  (changes at +64, +128, +192 → banks 0..3)
+  //   addr[9:8] = tile ID           (unchanged across the 256-byte span)
+  //
+  // Every vle32.v load (256 bytes, 4 cachelines) therefore maps entirely to
+  // ONE tile while distributing its 4 cachelines across the 4 banks of that
+  // tile.  The random offset in offset_dram picks which tile is targeted
+  // (local or remote), so ~75 % of loads hit a remote tile and generate
+  // traffic on the inter-tile xbar.
+  const uint32_t scramble_bits = 6;
+
+  if (cid == 0) {
+    l1d_xbar_config(scramble_bits);
+    // Fully shared
+    l1d_part(4);
+#ifdef DEBUG
+    printf("scramble_bits=%u v_len=%u\n", scramble_bits, v_len);
+#endif
+  }
+
+  snrt_cluster_hw_barrier();
+
+  // -----------------------------------------------------------------------
+  // Phase 1: warmup — fill L1 cache with all of data_dram.
+  // Each core streams through its own slice (M / num_cores elements) using
+  // the widest LMUL so that every tile's banks are populated in parallel.
+  // -----------------------------------------------------------------------
+  // Slice stays inside data_dram; the last core also takes the remainder.
+  const uint32_t elems_per_core = M / num_cores;
+  const uint32_t is_last = (cid == num_cores - 1);
+  const int *wp = data_dram + cid * elems_per_core;
+  uint32_t avl = is_last ? M - cid * elems_per_core : elems_per_core;
+  uint32_t wvl;
+  do {
+    asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(wvl) : "r"(avl));
+    asm volatile("vle32.v v0, (%0)" :: "r"(wp));
+    wp  += wvl;
+    avl -= wvl;
+  } while (avl > 0);
+
+  // Barrier: every tile's banks must be populated before measurement starts.
+  snrt_cluster_hw_barrier();
+
+  // -----------------------------------------------------------------------
+  // Phase 2: measurement — random loads from the now-hot cache.
+  // -----------------------------------------------------------------------
+
+  // Per-core pointer into the offset table.
+  // Layout: interleaved by core —
+  //   [core0_round0, core1_round0, ..., coreN_round0, core0_round1, ...]
+  const uint32_t *offset_p = offset_dram + cid;
+
+  const int *data = data_dram;
+  const int *addr1, *addr2, *addr3, *addr4;
+
+  uint32_t vl;
+  asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(v_len));
+
+  uint32_t timer = 0;
+
+  if (cid == 0) {
+    start_kernel();
+    timer = benchmark_get_cycle();
+  }
+
+  // Four loads per inner iteration to overlap address computation with loads.
+  for (uint32_t i = 0; i < measure_groups; i++) {
+    addr1 = data + *offset_p; offset_p += num_cores;
+    addr2 = data + *offset_p; offset_p += num_cores;
+    asm volatile("vle32.v v0, (%0)" :: "r"(addr1));
+    asm volatile("vle32.v v4, (%0)" :: "r"(addr2));
+    addr3 = data + *offset_p; offset_p += num_cores;
+    addr4 = data + *offset_p; offset_p += num_cores;
+    asm volatile("vle32.v v8, (%0)" :: "r"(addr3));
+    asm volatile("vle32.v v12, (%0)" :: "r"(addr4));
+  }
+
+  snrt_cluster_hw_barrier();
+
+  if (cid == 0) {
+    timer = benchmark_get_cycle() - timer;
+    stop_kernel();
+
+    // elements loaded by one core per 1000 cycles (0 if no cycles elapsed)
+    uint32_t performance = timer ? measure_iterations * v_len * 1000 / timer : 0;
+    // 1000‰ = one vector load per cycle (peak throughput)
+    uint32_t utilization = performance / v_len;
+
+    printf("\n----- random-load bw: %u iters x %u elems -----\n",
+           measure_iterations, v_len);
+    printf("Total cycles: %u, avg per load: %u\n",
+           timer, measure_iterations ? timer / measure_iterations : 0);
+    printf("Performance: %u elems/1000cyc (%u%%o utilization)\n",
+           performance, utilization);
+
+    write_cyc(timer);
+  }
+
+  snrt_cluster_hw_barrier();
+
+  return 0;
+}
diff --git a/software/tests/bandwidth/script/bw.json b/software/tests/bandwidth/script/bw.json
new file mode 100644
index 0000000..7abcd4d
--- /dev/null
+++ b/software/tests/bandwidth/script/bw.json
@@ -0,0 +1,17 @@
+// Parameters for CachePool Random-Load Bandwidth Benchmark
+//
+// M    : elements in data_dram (4096 int32 = 16 KB); main.c warms the L1 with
+//         the whole array before measuring, so it must fit in the total L1
+// round: measurement rounds per core; must be divisible by 4
+// prec : element width in bits (32 = int)
+// core : active cores (matches default cluster: 4 tiles × 4 cores = 16)
+// step : load granularity in elements = v_len for lmul=4, VLEN=512
+//         (4 × 512/32 = 64 elements per vle32.v)
+
+{
+  M: 4096,
+  round: 64,
+  prec: 32,
+  core: 16,
+  step: 64
+}
diff --git a/software/tests/bandwidth/script/gen_data.py b/software/tests/bandwidth/script/gen_data.py
new file mode 100644
index 0000000..38408d8
--- /dev/null
+++ b/software/tests/bandwidth/script/gen_data.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+# Author: Diyou Shen
+#
+# Generate data/data.h for the CachePool random-load bandwidth benchmark.
+#
+# Layout in the generated header:
+#   const uint32_t M = <M>;
+#   const uint32_t R = <R>;
+#   static int      data_dram[M]           — random payload in DRAM
+#   static uint32_t offset_dram[R * cores] — interleaved per-core offsets
+#
+# Each offset is a v_len-aligned element index into data_dram (step = v_len).
+# Offsets are drawn uniformly over the whole data_dram range; main.c streams
+# data_dram into the L1 before it starts measuring the random loads.
+
+import numpy as np
+import argparse
+import pathlib
+import hjson
+
+np.random.seed(42)
+
+
+def array_to_cstr(a):
+    """Format a numpy integer array as a C initialiser list."""
+    out = '{\n'
+    values_per_line = 16
+    flat = a.flatten()
+    for i in range(0, len(flat), values_per_line):
+        chunk = flat[i:i + values_per_line]
+        line = ', '.join(str(v) for v in chunk)
+        if i + values_per_line < len(flat):
+            out += ' ' + line + ',\n'
+        else:
+            out += ' ' + line + '\n'
+    out += '}'
+    return out
+
+
+def rand_data_generator(shape, prec):
+    """Return a random integer numpy array of the requested precision."""
+    dtype_map = {64: np.int64, 32: np.int32, 16: np.int16, 8: np.int8}
+    dtype = dtype_map[prec]
+    return np.random.randint(-100, 100, size=shape, dtype=dtype)
+
+
+def rand_offset_generator(num_entries, data_elems, step):
+    """Return v_len-aligned random element offsets that span data_dram.
+
+    Each offset satisfies:
+        offset + step <= data_elems   (in-bounds for a v_len-wide load)
+    Offsets are expressed in elements (not bytes).
+
+    Parameters
+    ----------
+    num_entries : int
+        Total number of offset entries (cores * rounds).
+    data_elems : int
+        Size of data_dram in elements.
+    step : int
+        Load granularity in elements (= v_len, e.g. 64 for lmul=4
+        VLEN=512).
+
+    Raises
+    ------
+    ValueError
+        If step is not positive or data_dram is smaller than one step.
+    """
+    if step <= 0:
+        raise ValueError(f"step ({step}) must be positive")
+    max_idx = data_elems // step - 1  # last valid aligned block index
+    if max_idx < 0:
+        raise ValueError(f"data_elems ({data_elems}) < step ({step})")
+    indices = np.random.randint(0, max_idx + 1, size=num_entries, dtype=np.uint32)
+    return indices * step
+
+
+def emit_header(data_arr, offset_arr, M, R, cores, prec):
+    """Return the text of data.h: M, R, data_dram and offset_dram."""
+    ctypes = {64: 'int64_t', 32: 'int', 16: 'int16_t', 8: 'int8_t'}
+    dtype = ctypes[prec]
+
+    offset_size = R * cores
+
+    s = '// Copyright 2022 ETH Zurich and University of Bologna.\n'
+    s += '// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n'
+    s += '// SPDX-License-Identifier: Apache-2.0\n'
+    s += '// This file was generated automatically by script/gen_data.py.\n\n'
+    # stdint.h provides uint32_t / intN_t used by the declarations below
+    s += '#include <stdint.h>\n\n'
+    s += f'const uint32_t M = {M};\n'
+    s += f'const uint32_t R = {R};\n\n'
+    # data_dram: payload array accessed by random vector loads
+    s += (f'static {dtype} data_dram[{M}]'
+          f' __attribute__((section(".data"))) = '
+          + array_to_cstr(data_arr) + ';\n\n')
+    # offset_dram: interleaved per-core random element offsets
+    # Layout: [core0_r0, core1_r0, ..., coreN_r0, core0_r1, ...]
+    s += (f'static uint32_t offset_dram[{offset_size}]'
+          f' __attribute__((section(".data"))) = '
+          + array_to_cstr(offset_arr) + ';\n\n')
+    return s
+
+
+def main():
+    """Read the parameter file given by --cfg and write ../data/data.h."""
+    parser = argparse.ArgumentParser(
+        description='Generate data.h for the CachePool bandwidth benchmark')
+    parser.add_argument('-c', '--cfg', type=pathlib.Path, required=True,
+                        help='Path to parameter JSON (e.g. script/bw.json)')
+    args = parser.parse_args()
+
+    with args.cfg.open() as f:
+        p = hjson.loads(f.read())
+
+    M = p['M']          # elements in data_dram
+    R = p['round']      # measurement rounds per core
+    prec = p['prec']    # element width in bits (32)
+    cores = p['core']   # number of active cores
+    step = p['step']    # load granularity in elements (= v_len)
+
+    data_arr = rand_data_generator((M,), prec)
+    offset_arr = rand_offset_generator(R * cores, M, step)
+
+    out_path = pathlib.Path(__file__).parent.parent / 'data' / 'data.h'
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open('w') as f:
+        f.write(emit_header(data_arr, offset_arr, M, R, cores, prec))
+
+    print(f'Generated {out_path} (M={M}, R={R}, cores={cores}, step={step})')
+
+
+if __name__ == '__main__':
+    main()
From beaf3c444c4d1bbec4b7204691112213c208fbe8 Mon Sep 17 00:00:00 2001
From: Diyou Shen
Date: Fri, 22 May 2026 16:31:16 +0200
Subject: [PATCH 24/37] [BootROM] Remove git tracing unnecessary bootrom generated files.
--- .gitignore | 6 ++ hardware/bootrom/bootdata.cc | 19 ---- hardware/bootrom/bootdata_bootrom.cc | 28 ------ hardware/bootrom/bootrom.bin | Bin 136 -> 0 bytes hardware/bootrom/bootrom.dump | 127 --------------------------- hardware/bootrom/bootrom.elf | Bin 5248 -> 0 bytes hardware/bootrom/bootrom.sv | 44 ---------- 7 files changed, 6 insertions(+), 218 deletions(-) delete mode 100644 hardware/bootrom/bootdata.cc delete mode 100644 hardware/bootrom/bootdata_bootrom.cc delete mode 100755 hardware/bootrom/bootrom.bin delete mode 100644 hardware/bootrom/bootrom.dump delete mode 100755 hardware/bootrom/bootrom.elf delete mode 100644 hardware/bootrom/bootrom.sv diff --git a/.gitignore b/.gitignore index 8265b48..5a930ee 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,9 @@ hardware/deps/* *.tdb util/lint/sg_projects util/lint/tmp +hardware/bootrom/bootdata.cc +hardware/bootrom/bootdata_bootrom.cc +hardware/bootrom/bootrom.bin +hardware/bootrom/bootrom.dump +hardware/bootrom/bootrom.elf +hardware/bootrom/bootrom.sv diff --git a/hardware/bootrom/bootdata.cc b/hardware/bootrom/bootdata.cc deleted file mode 100644 index 55a40ec..0000000 --- a/hardware/bootrom/bootdata.cc +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -#include - -namespace sim { - -const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 256, - .hartid_base = 0, - .tcdm_start = 0xbffff800, - .tcdm_size = 0x800, - .tcdm_offset = 0x0, - .global_mem_start = 0x80000000, - .global_mem_end = 0xa0000000, - .tile_count = 64}; - -} // namespace sim diff --git a/hardware/bootrom/bootdata_bootrom.cc b/hardware/bootrom/bootdata_bootrom.cc deleted file mode 100644 index db006ef..0000000 --- a/hardware/bootrom/bootdata_bootrom.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. 
-// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -#include - -// The boot data generated along with the system RTL. -struct BootData { - uint32_t boot_addr; - uint32_t core_count; - uint32_t hartid_base; - uint32_t tcdm_start; - uint32_t tcdm_size; - uint32_t tcdm_offset; - uint64_t global_mem_start; - uint64_t global_mem_end; - uint32_t tile_count; -}; - -extern "C" const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 256, - .hartid_base = 0, - .tcdm_start = 0xbffff800, - .tcdm_size = 0x800, - .tcdm_offset = 0x0, - .global_mem_start = 0x80000000, - .global_mem_end = 0xa0000000, - .tile_count = 64}; diff --git a/hardware/bootrom/bootrom.bin b/hardware/bootrom/bootrom.bin deleted file mode 100755 index aee11d2c064f36b54f18744ef583a63c09c1d001..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 136 zcmWe: - 1000: 00000317 auipc t1,0x0 - 1004: 07832303 lw t1,120(t1) # 1078 <_GLOBAL_OFFSET_TABLE_+0x4> - 1008: 30531073 csrw mtvec,t1 - 100c: f1402573 csrr a0,mhartid - 1010: 00000597 auipc a1,0x0 - 1014: 06c5a583 lw a1,108(a1) # 107c <_GLOBAL_OFFSET_TABLE_+0x8> - 1018: 3047d073 csrwi mie,15 - 101c: 10500073 wfi - 1020: 00c5a383 lw t2,12(a1) - 1024: 0105ae03 lw t3,16(a1) - 1028: 01c383b3 add t2,t2,t3 - 102c: 02038393 addi t2,t2,32 - 1030: 0003a383 lw t2,0(t2) - 1034: 00038067 jr t2 - -00001038 : - 1038: 10500073 wfi - 103c: ffdff06f j 1038 - -Disassembly of section .rodata: - -00001040 : - 1040: 1000 .2byte 0x1000 - 1042: 0000 .2byte 0x0 - 1044: 0100 .2byte 0x100 - 1046: 0000 .2byte 0x0 - 1048: 0000 .2byte 0x0 - 104a: 0000 .2byte 0x0 - 104c: f800 .2byte 0xf800 - 104e: bfff .2byte 0xbfff - 1050: 0800 .2byte 0x800 - ... 
- 105a: 8000 .2byte 0x8000 - 105c: 0000 .2byte 0x0 - 105e: 0000 .2byte 0x0 - 1060: 0000 .2byte 0x0 - 1062: a000 .2byte 0xa000 - 1064: 0000 .2byte 0x0 - 1066: 0000 .2byte 0x0 - 1068: 0040 .2byte 0x40 - 106a: 0000 .2byte 0x0 - 106c: 0000 .2byte 0x0 - ... - -Disassembly of section .boot_section: - -00001070 : - 1070: 1038 .2byte 0x1038 - ... - -Disassembly of section .got: - -00001074 <_GLOBAL_OFFSET_TABLE_>: - 1074: 0000 .2byte 0x0 - 1076: 0000 .2byte 0x0 - 1078: 1038 .2byte 0x1038 - 107a: 0000 .2byte 0x0 - 107c: 1040 .2byte 0x1040 - ... - -Disassembly of section .got.plt: - -00001080 <.got.plt>: - 1080: ffff .2byte 0xffff - 1082: ffff .2byte 0xffff - 1084: 0000 .2byte 0x0 - ... - -Disassembly of section .riscv.attributes: - -00000000 <.riscv.attributes>: - 0: 4341 .2byte 0x4341 - 2: 0000 .2byte 0x0 - 4: 7200 .2byte 0x7200 - 6: 7369 .2byte 0x7369 - 8: 01007663 bgeu zero,a6,14 <_start-0xfec> - c: 0039 .2byte 0x39 - e: 0000 .2byte 0x0 - 10: 1004 .2byte 0x1004 - 12: 7205 .2byte 0x7205 - 14: 3376 .2byte 0x3376 - 16: 6932 .2byte 0x6932 - 18: 7032 .2byte 0x7032 - 1a: 5f31 .2byte 0x5f31 - 1c: 326d .2byte 0x326d - 1e: 3070 .2byte 0x3070 - 20: 615f 7032 5f31 .byte 0x5f, 0x61, 0x32, 0x70, 0x31, 0x5f - 26: 3266 .2byte 0x3266 - 28: 3270 .2byte 0x3270 - 2a: 7a5f 6369 7273 .byte 0x5f, 0x7a, 0x69, 0x63, 0x73, 0x72 - 30: 7032 .2byte 0x7032 - 32: 5f30 .2byte 0x5f30 - 34: 697a .2byte 0x697a - 36: 6566 .2byte 0x6566 - 38: 636e .2byte 0x636e - 3a: 6965 .2byte 0x6965 - 3c: 7032 .2byte 0x7032 - 3e: 0030 .2byte 0x30 - 40: 0108 .2byte 0x108 - 42: 0b0a .2byte 0xb0a - -Disassembly of section .comment: - -00000000 <.comment>: - 0: 3a434347 .4byte 0x3a434347 - 4: 2820 .2byte 0x2820 - 6: 736f7263 bgeu t5,s6,72a <_start-0x8d6> - a: 6f6f7473 csrrci s0,0x6f6,30 - e: 2d6c .2byte 0x2d6c - 10: 474e .2byte 0x474e - 12: 3120 .2byte 0x3120 - 14: 322e .2byte 0x322e - 16: 2e35 .2byte 0x2e35 - 18: 2e30 .2byte 0x2e30 - 1a: 3538 .2byte 0x3538 - 1c: 365f 6331 6334 .byte 0x5f, 0x36, 0x31, 0x63, 0x34, 0x63 - 22: 
20296163 bltu s2,sp,224 <_start-0xddc> - 26: 2e39 .2byte 0x2e39 - 28: 2e35 .2byte 0x2e35 - 2a: 0030 .2byte 0x30 diff --git a/hardware/bootrom/bootrom.elf b/hardware/bootrom/bootrom.elf deleted file mode 100755 index 83b7f0bb367b6c3adf3549dd4ed0adcec21f2f9e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5248 zcmeHLyKWOf6uq0xiv$UW1|b?Ff|N)wS{nxvL}4d>G**lhqjW5r@j4jA+12hOO44A- zO?nDiq==S=51>GiC@A;?eu0Q4ow;Xs$BINlMKxEtb6;of%$}KM=WVT7w=9daOweyi zq8w3)lizNpl@jG?f|h8W=Ex=rPr0t8nmLnfF;~wgU|3JtKxbH!9IO$;fMLKeU>GnA z7zPXjh5^HXVZbn87%&VN2L2lZxA3(bzVKFGWeuw@J^tJ%8&Y?*b$ zf_!Czho~3JL$cP`oy<)|hmVTAVpwqd#n5p*c->-HbWeIRj*vR(b^U$mGsdA=Yi_== zvGFv&B%>gXRS@hw+-&3vqPQX)vAW_uF36IU-g16Ta0W~ME!p1e^aUf?A3!$0d_cl+ zok?R+;}|a=M1yh6Bwi7lrmsZkHy@V4s@Jz_ZMR)nZ`NFLW93DP)?2N1wbHH-iTJp$ydAbG zO6_ayDgQ{3hyrvbf!1#9%Vce_8*qsS5$*wl>mfX)qTbGd@?#P*==XVp__lwPJB-71 zO-=-Su$I|qNl%jKE+>G)$%7}&mB#67zMSE3BXB-Y-2+(kfIMM^4;;<~nw&Yu5X^x- zW7W6;W1xC1E{$ 1 ? $clog2(RomSize) : 1; - - const logic [RomSize-1:0][DataWidth-1:0] mem = { - 128'h00001040000010380000000000001038, - 128'h000000000000004000000000a0000000, - 128'h00000000800000000000000000000800, - 128'hbffff800000000000000010000001000, - 128'hffdff06f10500073000380670003a383, - 128'h0203839301c383b30105ae0300c5a383, - 128'h105000733047d07306c5a58300000597, - 128'hf1402573305310730783230300000317 - }; - - logic [AddrBits-1:0] addr_q; - - always_ff @(posedge clk_i) begin - if (req_i) begin - addr_q <= addr_i[AddrBits-1+4:4]; - end - end - - // this prevents spurious Xes from propagating into - // the speculative fetch stage of the core - assign rdata_o = (addr_q < RomSize) ? mem[addr_q] : '0; -endmodule From 0782e24071fd72c7fa0c1852fc96841195f61d0e Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 22 May 2026 16:36:46 +0200 Subject: [PATCH 25/37] [SRC] Add L2 ICache to the cluster. 
--- Bender.lock | 17 ++ Bender.yml | 12 +- Makefile | 14 +- hardware/src/axi_hier_interco.sv | 322 ++++++++++++++++++++ hardware/src/cachepool_group.sv | 121 +++++--- hardware/src/cachepool_group_noc_wrapper.sv | 4 +- hardware/src/cachepool_pkg.sv | 10 +- hardware/src/cachepool_tile.sv | 14 +- 8 files changed, 454 insertions(+), 60 deletions(-) create mode 100644 hardware/src/axi_hier_interco.sv diff --git a/Bender.lock b/Bender.lock index 4f60875..b547e3a 100644 --- a/Bender.lock +++ b/Bender.lock @@ -31,6 +31,16 @@ packages: Git: https://github.com/pulp-platform/axi_stream.git dependencies: - common_cells + cluster_icache: + revision: ce0ed94a5b95f5c76b9fa51940303fcce53f56e5 + version: null + source: + Git: https://github.com/pulp-platform/cluster_icache.git + dependencies: + - axi + - common_cells + - scm + - tech_cells_generic common_cells: revision: 9ca8a7655f741e7dd5736669a20a301325194c28 version: 1.39.0 @@ -132,6 +142,13 @@ packages: dependencies: - common_cells - tech_cells_generic + scm: + revision: 1976c7efb4979271eee2abe262fde0f9a20e2557 + version: 1.2.1 + source: + Git: https://github.com/pulp-platform/scm.git + dependencies: + - tech_cells_generic spatz: revision: ed25c78dd72d839db8141287f9516d78ee399b93 version: null diff --git a/Bender.yml b/Bender.yml index 9c64d0d..7873a05 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,14 +10,14 @@ dependencies: axi_riscv_atomics: { git: "https://github.com/pulp-platform/axi_riscv_atomics.git", version: 0.7.0 } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.28.0 } FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } - idma: { git: "https://github.com/pulp-platform/iDMA.git", version: 0.4.2 } register_interface: { git: "https://github.com/pulp-platform/register_interface.git", version: 0.3.8 } riscv-dbg: { git: "https://github.com/pulp-platform/riscv-dbg.git", version: 0.7.0 } tech_cells_generic: { git: 
"https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.11 } Insitu-Cache: { git: "https://github.com/pulp-platform/Insitu-Cache.git", rev: zexin/cachepool_dev } - spatz: { git: "https://github.com/pulp-platform/spatz.git", rev: cachepool-32b } dram_rtl_sim: { git: "https://github.com/pulp-platform/dram_rtl_sim.git", rev: cachepool } floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: main } + cluster_icache: { git: "https://github.com/pulp-platform/cluster_icache.git", rev: main } + spatz: { git: "https://github.com/pulp-platform/spatz.git", rev: cachepool-32b } workspace: checkout_dir: "./hardware/deps" @@ -38,12 +38,14 @@ sources: - hardware/cachepool_peripheral/cachepool_peripheral.sv # Bootrom - hardware/bootrom/bootrom.sv - # Barrier - - hardware/src/cachepool_tile_barrier.sv - - hardware/src/cachepool_cluster_barrier.sv # Level 1 - hardware/src/cachepool_pkg.sv - hardware/src/cachepool_cc.sv + # Barrier + - hardware/src/cachepool_tile_barrier.sv + - hardware/src/cachepool_cluster_barrier.sv + # ICache + - hardware/src/axi_hier_interco.sv # Level 2 - hardware/src/cachepool_tile.sv # Level 3 diff --git a/Makefile b/Makefile index f11d183..cecd7a9 100644 --- a/Makefile +++ b/Makefile @@ -257,6 +257,7 @@ VLOG_DEFS = -DCACHEPOOL # Cluster configuration VLOG_DEFS += -DNUM_GROUPS=$(num_groups) +VLOG_DEFS += -DNUM_GROUPS_X=$(num_groups_x) VLOG_DEFS += -DNUM_TILES=$(num_tiles) VLOG_DEFS += -DNUM_CORES=$(num_cores) VLOG_DEFS += -DDATA_WIDTH=$(data_width) @@ -267,23 +268,20 @@ VLOG_DEFS += -DREFILL_DATA_WIDTH=$(refill_data_width) # L1 Data Cache VLOG_DEFS += -DL1D_CACHELINE_WIDTH=$(l1d_cacheline_width) -VLOG_DEFS += -DL1D_SIZE=$(l1d_size) -VLOG_DEFS += -DL1D_BANK_FACTOR=$(l1d_bank_factor) VLOG_DEFS += -DL1D_COAL_WINDOW=$(l1d_coal_window) VLOG_DEFS += -DL1D_NUM_WAY=$(l1d_num_way) -VLOG_DEFS += -DL1D_TILE_SIZE=$(l1d_tile_size) VLOG_DEFS += -DL1D_TAG_DATA_WIDTH=$(l1d_tag_data_width) VLOG_DEFS += -DL1D_NUM_BANKS=$(l1d_num_banks) VLOG_DEFS 
+= -DL1D_DEPTH=$(l1d_depth) # CachePool CC / core cluster -VLOG_DEFS += -DSPATZ_FPU_EN=$(spatz_fpu_en) VLOG_DEFS += -DSPATZ_NUM_FPU=$(spatz_num_fpu) VLOG_DEFS += -DSPATZ_NUM_IPU=$(spatz_num_ipu) VLOG_DEFS += -DSPATZ_MAX_TRANS=$(spatz_max_trans) VLOG_DEFS += -DSNITCH_MAX_TRANS=$(snitch_max_trans) VLOG_DEFS += -DREMOTE_PORT_PER_CORE=$(num_remote_ports_per_tile) VLOG_DEFS += -DRG_PORT_PER_CORE=$(num_rg_ports_per_core) +VLOG_DEFS += -DNOC_PORT_PER_TILE=$(num_noc_ports_per_tile) # AXI configuration VLOG_DEFS += -DAXI_USER_WIDTH=$(axi_user_width) @@ -293,14 +291,12 @@ VLOG_DEFS += -DL2_CHANNEL=$(l2_channel) VLOG_DEFS += -DL2_BANK_WIDTH=$(l2_bank_width) VLOG_DEFS += -DL2_INTERLEAVE=$(l2_interleave) -# Peripherals / memory map -VLOG_DEFS += -DSTACK_ADDR=$(stack_addr) +# Stack / SPM (boot_addr, stack_addr, periph_start_addr, uart_addr used by hjson +# generator via environment; not consumed as SV defines) VLOG_DEFS += -DSTACK_HW_SIZE=$(stack_hw_size) VLOG_DEFS += -DSTACK_HW_DEPTH=$(stack_hw_depth) VLOG_DEFS += -DSTACK_TOT_SIZE=$(stack_tot_size) -VLOG_DEFS += -DPERIPH_START_ADDR=$(periph_start_addr) -VLOG_DEFS += -DBOOT_ADDR=$(boot_addr) -VLOG_DEFS += -DUART_ADDR=$(uart_addr) +VLOG_DEFS += -DSTACK_TOT_DEPTH=$(stack_tot_depth) ENABLE_CACHEPOOL_TESTS ?= 1 diff --git a/hardware/src/axi_hier_interco.sv b/hardware/src/axi_hier_interco.sv new file mode 100644 index 0000000..b4ab001 --- /dev/null +++ b/hardware/src/axi_hier_interco.sv @@ -0,0 +1,322 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Author: Samuel Riedel + +// Implement a hierarchical AXI interconnect. Below shows one level of the interconnect. This module +// recursively instantiates itself and creates a tree of interconnects, each node with `Radix` slave +// ports. 
+// +// AXI Mux Read-only ID Width +// Cache Converter +// |‾╲ +// +-------->| ╲ +// | + +-------+ +-------+ +// +-------->| M | | | | | +// | U |---->| $ |---->| > |----> +// | X | | | | | +// | + +-------+ +-------+ +// +-------->| ╱ +// |_╱ +// Internal Cache +// Slave type type type Master type + +module axi_hier_interco + import cachepool_pkg::ro_cache_ctrl_t; +#( + parameter int unsigned NumSlvPorts = 0, + parameter int unsigned NumMstPorts = 0, + parameter int unsigned Radix = 2, + parameter int unsigned EnableCache = 0, + parameter int unsigned CacheLineWidth = 0, + parameter int unsigned CacheSizeByte = 0, + parameter int unsigned CacheSets = 0, + parameter int unsigned AddrWidth = 0, + parameter int unsigned DataWidth = 0, + parameter int unsigned SlvIdWidth = 0, + parameter int unsigned MstIdWidth = 0, + parameter int unsigned UserWidth = 0, + parameter type slv_req_t = logic, + parameter type slv_resp_t = logic, + parameter type mst_req_t = logic, + parameter type mst_resp_t = logic +) ( + input logic clk_i, + input logic rst_ni, + input logic test_i, + input ro_cache_ctrl_t ro_cache_ctrl_i, + input slv_req_t [NumSlvPorts-1:0] slv_req_i, + output slv_resp_t [NumSlvPorts-1:0] slv_resp_o, + output mst_req_t [NumMstPorts-1:0] mst_req_o, + input mst_resp_t [NumMstPorts-1:0] mst_resp_i +); + + //////////////// + // Typedefs // + //////////////// + + localparam int unsigned IntIdWidth = SlvIdWidth + $clog2(NumSlvPorts); + localparam int unsigned CacheIdWidth = EnableCache[0] ? 
IntIdWidth + 1: IntIdWidth; + localparam int unsigned NrAddrRules = cachepool_pkg::ROCacheNumAddrRules; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [DataWidth-1:0] data_t; + typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [SlvIdWidth-1:0] slv_id_t; + typedef logic [MstIdWidth-1:0] mst_id_t; + typedef logic [IntIdWidth-1:0] int_id_t; + typedef logic [CacheIdWidth-1:0] cache_id_t; + typedef logic [UserWidth-1:0] user_t; + + `include "axi/typedef.svh" + // Common AXI types + `AXI_TYPEDEF_W_CHAN_T(w_t, data_t, strb_t, user_t); + // Slave AXI types + `AXI_TYPEDEF_AW_CHAN_T(slv_aw_t, addr_t, slv_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(slv_b_t, slv_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(slv_ar_t, addr_t, slv_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(slv_r_t, data_t, slv_id_t, user_t); + // Intermediate AXI types + `AXI_TYPEDEF_AW_CHAN_T(int_aw_t, addr_t, int_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(int_b_t, int_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(int_ar_t, addr_t, int_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(int_r_t, data_t, int_id_t, user_t); + `AXI_TYPEDEF_REQ_T(int_req_t, int_aw_t, w_t, int_ar_t); + `AXI_TYPEDEF_RESP_T(int_resp_t, int_b_t, int_r_t ); + // Cache AXI types + `AXI_TYPEDEF_AW_CHAN_T(cache_aw_t, addr_t, cache_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(cache_b_t, cache_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(cache_ar_t, addr_t, cache_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(cache_r_t, data_t, cache_id_t, user_t); + `AXI_TYPEDEF_REQ_T(cache_req_t, cache_aw_t, w_t, cache_ar_t); + `AXI_TYPEDEF_RESP_T(cache_resp_t, cache_b_t, cache_r_t ); + + /////////////// + // Interco // + /////////////// + + // Recursive module to implement multiple hierarchy levels at once + + if (NumMstPorts > NumSlvPorts) begin : gen_error + $error("[axi_hier_interco] `NumMstPorts` must be bigger than `NumSlvPorts`."); + end else if (NumMstPorts == NumSlvPorts) begin : gen_top_level + // Top-level, connect the ports to the master ports + for (genvar i = 0; i < NumMstPorts; 
i++) begin : gen_bypasses + assign mst_req_o[i] = slv_req_i[i]; + assign slv_resp_o[i] = mst_resp_i[i]; + end + end else if (Radix <= 1) begin : gen_error + $error("[axi_hier_interco] `Radix` must be bigger than 1."); + end else if (NumSlvPorts > Radix) begin : gen_axi_level_recursive + // More than one level missing. --> Recursively call this module + // This level will contain `NumMuxes` interconnects + localparam int unsigned NumMuxes = NumSlvPorts / Radix; + if (NumMuxes * Radix != NumSlvPorts) begin : gen_error + $error("[axi_hier_interco] `NumSlvPorts` mod `Radix` must be 0."); + end else begin : gen_level + slv_req_t [NumMuxes-1:0] int_req; + slv_resp_t [NumMuxes-1:0] int_resp; + + for (genvar i = 0; i < NumMuxes; i++) begin : gen_lower_level + axi_hier_interco #( + .NumSlvPorts (Radix ), + .NumMstPorts (1 ), + .Radix (Radix ), + .EnableCache (EnableCache ), + .CacheLineWidth (CacheLineWidth), + .CacheSizeByte (CacheSizeByte ), + .CacheSets (CacheSets ), + .AddrWidth (AddrWidth ), + .DataWidth (DataWidth ), + .SlvIdWidth (SlvIdWidth ), + .MstIdWidth (SlvIdWidth ), + .UserWidth (UserWidth ), + .slv_req_t (slv_req_t ), + .slv_resp_t (slv_resp_t ), + .mst_req_t (slv_req_t ), + .mst_resp_t (slv_resp_t ) + ) i_axi_interco ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .ro_cache_ctrl_i (ro_cache_ctrl_i ), + .slv_req_i (slv_req_i[i*Radix +: Radix] ), + .slv_resp_o (slv_resp_o[i*Radix +: Radix]), + .mst_req_o (int_req[i] ), + .mst_resp_i (int_resp[i] ) + ); + end + + axi_hier_interco #( + .NumSlvPorts (NumMuxes ), + .NumMstPorts (NumMstPorts ), + .Radix (Radix ), + .EnableCache (EnableCache>>1), + .CacheLineWidth (CacheLineWidth), + .CacheSizeByte (CacheSizeByte ), + .CacheSets (CacheSets ), + .AddrWidth (AddrWidth ), + .DataWidth (DataWidth ), + .SlvIdWidth (SlvIdWidth ), + .MstIdWidth (MstIdWidth ), + .UserWidth (UserWidth ), + .slv_req_t (slv_req_t ), + .slv_resp_t (slv_resp_t ), + .mst_req_t (mst_req_t ), + .mst_resp_t (mst_resp_t ) + ) 
i_axi_interco ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .ro_cache_ctrl_i (ro_cache_ctrl_i), + .slv_req_i (int_req ), + .slv_resp_o (int_resp ), + .mst_req_o (mst_req_o ), + .mst_resp_i (mst_resp_i ) + ); + end + end else if (NumSlvPorts <= Radix && NumMstPorts == 1) begin : gen_bottom_level + + // Intermediate AXI channel + int_req_t int_req; + int_resp_t int_resp; + cache_req_t cache_req; + cache_resp_t cache_resp; + + axi_mux #( + // AXI parameter and channel types + .SlvAxiIDWidth (SlvIdWidth ), // AXI ID width, slave ports + .slv_aw_chan_t (slv_aw_t ), // AW Channel Type, slave ports + .mst_aw_chan_t (int_aw_t ), // AW Channel Type, master port + .w_chan_t (w_t ), // W Channel Type, all ports + .slv_b_chan_t (slv_b_t ), // B Channel Type, slave ports + .mst_b_chan_t (int_b_t ), // B Channel Type, master port + .slv_ar_chan_t (slv_ar_t ), // AR Channel Type, slave ports + .mst_ar_chan_t (int_ar_t ), // AR Channel Type, master port + .slv_r_chan_t (slv_r_t ), // R Channel Type, slave ports + .mst_r_chan_t (int_r_t ), // R Channel Type, master port + .slv_req_t (slv_req_t ), // Slave port request type + .slv_resp_t (slv_resp_t ), // Slave port response type + .mst_req_t (int_req_t ), // Master ports request type + .mst_resp_t (int_resp_t ), // Master ports response type + .NoSlvPorts (NumSlvPorts), // Number of slave ports + // Maximum number of outstanding transactions per write + .MaxWTrans (8 ), + // If enabled, this multiplexer is purely combinatorial + .FallThrough (1'b0 ), + // add spill register on write master ports, adds a cycle latency on write channels + .SpillAw (1'b1 ), + .SpillW (1'b1 ), + .SpillB (1'b1 ), + // add spill register on read master ports, adds a cycle latency on read channels + .SpillAr (1'b1 ), + .SpillR (1'b1 ) + ) i_axi_mux ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .slv_reqs_i (slv_req_i ), + .slv_resps_o (slv_resp_o), + .mst_req_o (int_req ), + .mst_resp_i (int_resp ) + ); + + if 
(EnableCache[0]) begin: gen_ro_cache + localparam int unsigned LineCount = CacheSizeByte/(CacheSets*CacheLineWidth/8); + snitch_read_only_cache #( + .LineWidth (CacheLineWidth), + .LineCount (LineCount ), + .WayCount (CacheSets ), + .AxiAddrWidth (AddrWidth ), + .AxiDataWidth (DataWidth ), + .AxiIdWidth (IntIdWidth ), + .AxiUserWidth (UserWidth ), + .MaxTrans (32'd16 ), + .NrAddrRules (NrAddrRules ), + .SerialLookup (0 ), + .slv_req_t (int_req_t ), + .slv_rsp_t (int_resp_t ), + .mst_req_t (cache_req_t ), + .mst_rsp_t (cache_resp_t ) + ) i_snitch_read_only_cache ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .enable_i (ro_cache_ctrl_i.enable ), + .flush_valid_i (ro_cache_ctrl_i.flush_valid), + .flush_ready_o (/* unused */ ), + .icache_events_o (/* unused */ ), + .start_addr_i (ro_cache_ctrl_i.start_addr ), + .end_addr_i (ro_cache_ctrl_i.end_addr ), + .axi_slv_req_i (int_req ), + .axi_slv_rsp_o (int_resp ), + .axi_mst_req_o (cache_req ), + .axi_mst_rsp_i (cache_resp ), + .sram_cfg_data_i ('0 ), + .sram_cfg_tag_i ('0 ), + .sram_cfg_out_data_o (/* unused */ ), + .sram_cfg_out_tag_o (/* unused */ ) + ); + end else begin: gen_no_ro_cache + assign cache_req = int_req; + assign int_resp = cache_resp; + end + + axi_id_remap #( + .AxiSlvPortIdWidth (CacheIdWidth ), + .AxiSlvPortMaxUniqIds (2**MstIdWidth), + .AxiMaxTxnsPerId (8 ), + .AxiMstPortIdWidth (MstIdWidth ), + .slv_req_t (cache_req_t ), + .slv_resp_t (cache_resp_t ), + .mst_req_t (mst_req_t ), + .mst_resp_t (mst_resp_t ) + ) i_axi_id_remap ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (cache_req ), + .slv_resp_o (cache_resp), + .mst_req_o (mst_req_o ), + .mst_resp_i (mst_resp_i) + ); + + // Check all the AXI widths + if ($bits(slv_req_i[0].aw.addr) != AddrWidth) + $error("[axi_hier_interco] `slv_req_i.aw.addr` does not match AddrWidth."); + if ($bits(slv_req_i[0].w.data) != DataWidth) + $error("[axi_hier_interco] `slv_req_i.w.data` does not match DataWidth."); + if ($bits(slv_req_i[0].aw.id) != SlvIdWidth) + 
$error("[axi_hier_interco] `slv_req_i.aw.id` does not match SlvIdWidth."); + if ($bits(slv_req_i[0].aw.user) != UserWidth) + $error("[axi_hier_interco] `slv_req_i.aw.user` does not match UserWidth."); + + if ($bits(mst_req_o[0].aw.addr) != AddrWidth) + $error("[axi_hier_interco] `mst_req_o.aw.addr` does not match AddrWidth."); + if ($bits(mst_req_o[0].w.data) != DataWidth) + $error("[axi_hier_interco] `mst_req_o.w.data` does not match DataWidth."); + if ($bits(mst_req_o[0].aw.id) != MstIdWidth) + $error("[axi_hier_interco] `mst_req_o.aw.id` does not match MstIdWidth."); + if ($bits(mst_req_o[0].aw.user) != UserWidth) + $error("[axi_hier_interco] `mst_req_o.aw.user` does not match UserWidth."); + + if ($bits(int_req.aw.addr) != AddrWidth) + $error("[axi_hier_interco] `int_req.aw.addr` does not match AddrWidth."); + if ($bits(int_req.w.data) != DataWidth) + $error("[axi_hier_interco] `int_req.w.data` does not match DataWidth."); + if ($bits(int_req.aw.id) != IntIdWidth) + $error("[axi_hier_interco] `int_req.aw.id` does not match IntIdWidth."); + if ($bits(int_req.aw.user) != UserWidth) + $error("[axi_hier_interco] `int_req.aw.user` does not match UserWidth."); + + if ($bits(cache_req.aw.addr) != AddrWidth) + $error("[axi_hier_interco] `cache_req.aw.addr` does not match AddrWidth."); + if ($bits(cache_req.w.data) != DataWidth) + $error("[axi_hier_interco] `cache_req.w.data` does not match DataWidth."); + if ($bits(cache_req.aw.id) != CacheIdWidth) + $error("[axi_hier_interco] `cache_req.aw.id` does not match CacheIdWidth."); + if ($bits(cache_req.aw.user) != UserWidth) + $error("[axi_hier_interco] `cache_req.aw.user` does not match UserWidth."); + end else begin: gen_error + $error("[axi_hier_interco] Cannot build a tree with those parameters."); + end +endmodule diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index 0a0798d..1e6e5d4 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -14,7 +14,7 @@ 
module cachepool_group import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; - import snitch_icache_pkg::icache_events_t; + import snitch_icache_pkg::icache_l1_events_t; #( /// Width of physical address. parameter int unsigned AxiAddrWidth = 48, @@ -136,7 +136,7 @@ module cachepool_group input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, /// Peripheral signals - output icache_events_t [NrCores-1:0] icache_events_o, + output icache_l1_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, input logic [NrCores-1:0] cl_interrupt_i, input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, @@ -302,9 +302,14 @@ module cachepool_group cache_trans_req_t [NumL1CacheCtrlLocal-1:0] cache_refill_req; cache_trans_rsp_t [NumL1CacheCtrlLocal-1:0] cache_refill_rsp; - // cache_core_req/rsp: icache-bypass path, one per tile (from axi_to_reqrsp) - cache_trans_req_t [NumTilesPerGroup-1:0] cache_core_req; - cache_trans_rsp_t [NumTilesPerGroup-1:0] cache_core_rsp; + // L2 Group ICache AXI master output (from axi_hier_interco) + axi_mst_cache_req_t axi_l2icache_mst_req; + axi_mst_cache_resp_t axi_l2icache_mst_rsp; + // L2 Group ICache reqrsp output (to xbar port 0) + cache_trans_req_t cache_l2icache_req; + cache_trans_rsp_t cache_l2icache_rsp; + // L2 Group ICache control (hardwired) + ro_cache_ctrl_t l2icache_ctrl; // Flat xbar input channels: NumTilesPerGroup * NumClusterMst ports cache_trans_req_chan_t [NumTilesPerGroup*NumClusterMst-1:0] tile_req_chan; @@ -339,34 +344,69 @@ module cachepool_group end // --------------------- - // axi_to_reqrsp: TileMem (icache-bypass) path, one per tile + // L2 Group ICache: 4-to-1 AXI mux + read-only cache + ID remap // --------------------- - for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_axi_converter - axi_to_reqrsp #( - .axi_req_t ( axi_mst_cache_req_t ), - .axi_rsp_t ( axi_mst_cache_resp_t ), - .AddrWidth ( AxiAddrWidth ), - .DataWidth ( AxiDataWidth ), - 
.UserWidth ( $bits(refill_user_t) ), - .IdWidth ( AxiIdWidthIn ), - .BufDepth ( NumSpatzOutstandingLoads ), - .reqrsp_req_t ( cache_trans_req_t ), - .reqrsp_rsp_t ( cache_trans_rsp_t ) - ) i_axi2reqrsp ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .busy_o ( ), - .axi_req_i ( axi_tile_mem_req[t] ), - .axi_rsp_o ( axi_tile_mem_rsp[t] ), - .reqrsp_req_o ( cache_core_req[t] ), - .reqrsp_rsp_i ( cache_core_rsp[t] ) - ); + always_comb begin + l2icache_ctrl = '0; + l2icache_ctrl.enable = 1'b1; + l2icache_ctrl.flush_valid = 1'b0; + l2icache_ctrl.start_addr[0] = DramAddr; + l2icache_ctrl.end_addr[0] = DramAddr + DramSize; end + axi_hier_interco #( + .NumSlvPorts ( NumTilesPerGroup ), + .NumMstPorts ( 1 ), + .Radix ( NumTilesPerGroup ), + .EnableCache ( 1 ), + .CacheLineWidth ( L2ICacheLineWidth ), + .CacheSizeByte ( L2ICacheSizeByte ), + .CacheSets ( L2ICacheSets ), + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .SlvIdWidth ( WideIdWidthIn ), + .MstIdWidth ( WideIdWidthIn ), + .UserWidth ( AxiUserWidth ), + .slv_req_t ( axi_mst_cache_req_t ), + .slv_resp_t ( axi_mst_cache_resp_t ), + .mst_req_t ( axi_mst_cache_req_t ), + .mst_resp_t ( axi_mst_cache_resp_t ) + ) i_l2icache_interco ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( 1'b0 ), + .ro_cache_ctrl_i ( l2icache_ctrl ), + .slv_req_i ( axi_tile_mem_req ), + .slv_resp_o ( axi_tile_mem_rsp ), + .mst_req_o ( axi_l2icache_mst_req ), + .mst_resp_i ( axi_l2icache_mst_rsp ) + ); + + // Single axi_to_reqrsp for the L2 ICache master output + axi_to_reqrsp #( + .axi_req_t ( axi_mst_cache_req_t ), + .axi_rsp_t ( axi_mst_cache_resp_t ), + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .UserWidth ( $bits(refill_user_t) ), + .IdWidth ( WideIdWidthIn ), + .BufDepth ( NumSpatzOutstandingLoads ), + .reqrsp_req_t ( cache_trans_req_t ), + .reqrsp_rsp_t ( cache_trans_rsp_t ) + ) i_l2icache_axi2reqrsp ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .busy_o ( ), + .axi_req_i ( axi_l2icache_mst_req ), + 
.axi_rsp_o ( axi_l2icache_mst_rsp ), + .reqrsp_req_o ( cache_l2icache_req ), + .reqrsp_rsp_i ( cache_l2icache_rsp ) + ); + // --------------------- // Wiring: assemble flat xbar input from icache-bypass and refill paths // --------------------- - // Port layout per tile: p=0 -> icache-bypass (cache_core_req), + // Port layout per tile: p=0 -> L2 ICache output (t=0) or unused (t>0), // p=1..NumL1CtrlTile -> refill (cache_refill_req) localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; always_comb begin @@ -376,16 +416,23 @@ module cachepool_group automatic int unsigned refill_idx = t * NumL1CtrlTile + p - 1; if (p == 0) begin - // icache-bypass path - tile_req_chan [xbar_idx] = cache_core_req[t].q; - tile_req_chan [xbar_idx].addr = scrambleAddr(cache_core_req[t].q.addr); - tile_req_valid [xbar_idx] = cache_core_req[t].q_valid; - cache_core_rsp [t].q_ready = tile_req_ready[xbar_idx]; - - cache_core_rsp [t].p = tile_rsp_chan [xbar_idx]; - cache_core_rsp [t].p_valid = tile_rsp_valid[xbar_idx]; - tile_rsp_ready [xbar_idx] = cache_core_req[t].p_ready; - tile_req_chan [xbar_idx].user.tile_id = t; + if (t == 0) begin + // L2 ICache output → xbar port 0 + tile_req_chan [xbar_idx] = cache_l2icache_req.q; + tile_req_chan [xbar_idx].addr = scrambleAddr(cache_l2icache_req.q.addr); + tile_req_valid [xbar_idx] = cache_l2icache_req.q_valid; + cache_l2icache_rsp.q_ready = tile_req_ready[xbar_idx]; + + cache_l2icache_rsp.p = tile_rsp_chan [xbar_idx]; + cache_l2icache_rsp.p_valid = tile_rsp_valid[xbar_idx]; + tile_rsp_ready [xbar_idx] = cache_l2icache_req.p_ready; + tile_req_chan [xbar_idx].user.tile_id = '0; + end else begin + // unused icache-bypass ports (tiles 1-3) + tile_req_chan [xbar_idx] = '0; + tile_req_valid [xbar_idx] = 1'b0; + tile_rsp_ready [xbar_idx] = 1'b0; + end end else begin // refill path tile_req_chan [xbar_idx] = cache_refill_req[refill_idx].q; diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv index 
710038e..85ea868 100644 --- a/hardware/src/cachepool_group_noc_wrapper.sv +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -15,7 +15,7 @@ module cachepool_group_noc_wrapper import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; - import snitch_icache_pkg::icache_events_t; + import snitch_icache_pkg::icache_l1_events_t; #( parameter int unsigned AxiAddrWidth = 48, parameter int unsigned AxiDataWidth = 512, @@ -70,7 +70,7 @@ module cachepool_group_noc_wrapper input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, - output icache_events_t [NrCores-1:0] icache_events_o, + output icache_l1_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, input logic [NrCores-1:0] cl_interrupt_i, input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index ab08758..655509e 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -96,6 +96,12 @@ package cachepool_pkg; localparam int unsigned ICacheLineCount = 128; localparam int unsigned ICacheSets = 4; + // Group-level L2 ICache (shared read-only cache, primarily for coalescing) + localparam int unsigned L2ICacheLineWidth = 128; + localparam int unsigned L2ICacheSets = 4; + localparam int unsigned L2ICacheSizeByte = 65536; + localparam int unsigned L2ICacheLineCount = L2ICacheSizeByte / (L2ICacheSets * L2ICacheLineWidth / 8); + // Be careful on unsigned long int passed in from configuration. // Currently use fixed values. 
localparam int unsigned TCDMStartAddr = 32'hBFFF_F800; @@ -479,8 +485,8 @@ package cachepool_pkg; typedef struct packed { logic enable; logic flush_valid; - logic [ROCacheNumAddrRules-1:0][AddrWidth-1:0] start_addr; - logic [ROCacheNumAddrRules-1:0][AddrWidth-1:0] end_addr; + axi_addr_t [ROCacheNumAddrRules-1:0] start_addr; + axi_addr_t [ROCacheNumAddrRules-1:0] end_addr; } ro_cache_ctrl_t; diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 24adf18..658a3ed 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -16,7 +16,7 @@ module cachepool_tile import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; - import snitch_icache_pkg::icache_events_t; + import snitch_icache_pkg::icache_l1_events_t; #( /// Width of physical address. parameter int unsigned AxiAddrWidth = 48, @@ -163,7 +163,7 @@ module cachepool_tile input remote_group_req_t [TotRGPorts:0] remote_group_req_i, output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// Peripheral signals - output icache_events_t [NrCores-1:0] icache_events_o, + output icache_l1_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, input logic [NrCores-1:0] cl_interrupt_i, input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, @@ -400,7 +400,7 @@ module cachepool_tile core_events_t [NrCores-1:0] core_events; - snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; + // snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; // 4. Memory Subsystem (Core side). 
reqrsp_req_t [NrCores-1:0] core_req, filtered_core_req; @@ -1474,7 +1474,7 @@ module cachepool_tile .L0_LINE_COUNT ( 8 ), .LINE_WIDTH ( ICacheLineWidth ), .LINE_COUNT ( ICacheLineCount ), - .SET_COUNT ( ICacheSets ), + .WAY_COUNT ( ICacheSets ), .FETCH_AW ( AxiAddrWidth ), .FETCH_DW ( 32 ), .FILL_AW ( AxiAddrWidth ), @@ -1491,7 +1491,9 @@ module cachepool_tile .clk_d2_i ( clk_i ), .rst_ni ( rst_ni ), .enable_prefetching_i ( icache_prefetch_enable_i ), - .icache_events_o ( icache_events_o ), + .enable_branch_pred_i ( '0 ), + .icache_l0_events_o ( ), + .icache_l1_events_o ( ), .flush_valid_i ( flush_valid ), .flush_ready_o ( flush_ready ), .inst_addr_i ( inst_addr ), @@ -1502,6 +1504,8 @@ module cachepool_tile .inst_error_o ( inst_error ), .sram_cfg_tag_i ( '0 ), .sram_cfg_data_i ( '0 ), + .sram_cfg_out_data_o (), + .sram_cfg_out_tag_o (), .axi_req_o ( wide_axi_mst_req[ICache] ), .axi_rsp_i ( wide_axi_mst_rsp[ICache] ) ); From 0281808e44c36ef579ed12aed64a32f781f20f34 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 22 May 2026 16:37:11 +0200 Subject: [PATCH 26/37] [CI] Update CI flow. 
--- .gitlab-ci.yml | 78 +++++++++++++++++++++++++++++++++++++++--------- requirements.txt | 8 +++++ sim/sim.mk | 11 +++++-- 3 files changed, 81 insertions(+), 16 deletions(-) create mode 100644 requirements.txt diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 567242b..85cda79 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,29 +8,32 @@ variables: GIT_SUBMODULE_STRATEGY: none ROOT_DIR: '$CI_PROJECT_DIR' APPS: "tests" - PATH: '/home/gitlabci/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:/home/gitlabci/.local/bin' + PATH: '$HOME/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:$HOME/.local/bin' OBJCACHE: '' CC: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/gcc' CXX: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/g++' CMAKE: 'cmake-3.28.3' python: 'python3' python3: 'python3' + # Config to build and test + CI_CONFIG: 'cachepool_fpu_2g' + SW_PREFIX: 'test-cachepool-' default: - tags: [dolent] + tags: [shared] stages: - build + - test -.base: - artifacts: - when: always - expire_in: 1 day - -build-vsim: - extends: .base +# --------------------------------------------------------------------------- +# Build stage: compile RTL and software for CI_CONFIG. +# Parallel jobs within the same pipeline share $HOME, so the toolchain +# installed by make quick-tool is automatically available to all test jobs. 
+# --------------------------------------------------------------------------- +build: stage: build - timeout: 5h + timeout: 4h 30m script: - echo "Using CC=$CC" - echo "Using CXX=$CXX" @@ -39,10 +42,57 @@ build-vsim: - make quick-tool - make init - make dram-build - - cd util/auto-benchmark - - chmod +x ./run_ci.sh - - ./run_ci.sh + - python3 -m pip install --quiet -r requirements.txt + - make clean generate vsim config=$CI_CONFIG + artifacts: + when: always + expire_in: 1 day + paths: + # QuestaSim compiled work library + - sim/work/ + # vsim wrapper scripts (exclude sim/bin/logs/ — not needed by test jobs) + - sim/bin/cachepool_cluster.vsim + - sim/bin/cachepool_cluster.vsim.gui + # DPI shared library + - sim/work-dpi/ + # Software binaries for all kernels + - software/build/CachePoolTests/ + # DRAMSys shared libraries and config files (referenced by vsim at runtime) + - hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/build/lib/ + - hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/configs/ +# --------------------------------------------------------------------------- +# Test stage: run each kernel in parallel on a separate runner. +# Each job downloads the build artifacts, runs one simulation, and checks +# the output log for failures. +# --------------------------------------------------------------------------- +test: + stage: test + timeout: 1h + needs: [build] + parallel: + matrix: + - KERNEL: + - spin-lock + - load-store_M16 + - fdotp-32b_M32768 + - gemv_M512_N128_K32 + - fmatmul-32b_M64_N64_K64 + - fft-32b_M1024_N16 + - multi_producer_single_consumer_double_linked_list_M1_N1350_K10 + - byte-enable + script: + # The vsim script writes a .rtlbinary marker here; ensure the dir exists. 
+ - mkdir -p sim/bin/logs + - chmod +x sim/bin/cachepool_cluster.vsim + - BIN="${SW_PREFIX}${KERNEL}" + - sim/bin/cachepool_cluster.vsim software/build/CachePoolTests/$BIN 2>&1 | tee test_${KERNEL}.log + - python3 util/auto-benchmark/check-ci.py test_${KERNEL}.log artifacts: + when: always + expire_in: 1 day paths: - - util/auto-benchmark/logs + # Full simulation log + - test_*.log + # Performance-monitor trace files written by the simulator + - sim/bin/logs/ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..216cd0d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +# Python packages required for hardware code generation (make generate). +# dataclasses is a stdlib backport needed on Python 3.6; it is a no-op on 3.7+. +dataclasses +hjson +jsonref +jsonschema +mako +termcolor diff --git a/sim/sim.mk b/sim/sim.mk index 56cee77..9b106b5 100644 --- a/sim/sim.mk +++ b/sim/sim.mk @@ -100,17 +100,24 @@ ${WORK_DIR}/compile.vsim.tcl: ${SNLIB_DIR}/rtl_lib.cc ${SNLIB_DIR}/common_lib.cc echo 'return 0' >> $@ # Wrapper script & GUI script +# The generated scripts derive ROOT_DIR from their own location at runtime so +# that they remain portable across different checkout paths (CI runners, moved +# repos). All absolute paths baked in by make are replaced by a single sed pass. define QUESTASIM ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log @! grep -P "Errors: [1-9]*," $(dir $<)vsim.log @mkdir -p $(SIMBIN_DIR) $(SIMBIN_DIR)/logs - @echo "#!/bin/bash" > $(SIMBIN_DIR)/cachepool_cluster.vsim + @echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim + @echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." 
&& pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim @echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim @echo '${VSIM} +permissive ${VSIM_FLAGS} -do "run -a" -work ${WORK_DIR} -c -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim + @sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim @chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim - @echo "#!/bin/bash" > $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." && pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @echo '${VSIM} +permissive ${VSIM_FLAGS} -do "log -r /*; source ${WAVE_FILE}; run -a" -work ${WORK_DIR} -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim.gui endef From 5ee37903f0dd70aa82fc6cc54fe8d38393c32e53 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 26 May 2026 09:31:11 +0200 Subject: [PATCH 27/37] [Bender] Update Spatz version. 
--- Bender.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bender.lock b/Bender.lock index b547e3a..cff1d5f 100644 --- a/Bender.lock +++ b/Bender.lock @@ -150,7 +150,7 @@ packages: dependencies: - tech_cells_generic spatz: - revision: ed25c78dd72d839db8141287f9516d78ee399b93 + revision: 61bc09f805db7f7ae6c1ab99d3da63b0a8431281 version: null source: Git: https://github.com/pulp-platform/spatz.git From f3f4892cd03b7795869770424fde4c92b69739d6 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 26 May 2026 09:39:47 +0200 Subject: [PATCH 28/37] [MISC] Update README and fixing some coding style. --- .gitlab-ci.yml | 4 +- Makefile | 3 ++ README.md | 72 +++++++++++++++++----------------- hardware/src/cachepool_tile.sv | 72 +++++++++++++++++----------------- 4 files changed, 77 insertions(+), 74 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 85cda79..b54b177 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -46,7 +46,7 @@ build: - make clean generate vsim config=$CI_CONFIG artifacts: when: always - expire_in: 1 day + expire_in: 2h paths: # QuestaSim compiled work library - sim/work/ @@ -90,7 +90,7 @@ test: - python3 util/auto-benchmark/check-ci.py test_${KERNEL}.log artifacts: when: always - expire_in: 1 day + expire_in: 1 week paths: # Full simulation log - test_*.log diff --git a/Makefile b/Makefile index cecd7a9..9bd24f0 100644 --- a/Makefile +++ b/Makefile @@ -386,6 +386,9 @@ help: @echo "*generate*: generate the Spatz package and opcodes, and the cluster config HJSON" @echo "*cache-init*: source the insitu-cache environment (requires bender checkout)" @echo "*bootrom*: compile and generate the bootrom SystemVerilog module" + @echo "*update-floonoc*: regenerate FlooNoC package from FLOO_CFG (run after changing group count)" + @echo "*install-floogen*: install the floogen Python tool (required by update-floonoc)" + @echo "*clean-floonoc*: remove the generated FlooNoC package" @echo "" @echo "DRAMSys:" @echo "" diff --git 
a/README.md b/README.md index 48c83e9..d00a967 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,11 @@ CachePool is a Snitch–Spatz–based many-core system with a shared L1 data cac | Level | Module | Description | |-------|--------|-------------| | 1 | Core Complex (CC) | One 32-bit Snitch + one Spatz RVV accelerator | -| 2 | Tile | 4 CCs + 4 × 64 KiB 4-way InSitu-Cache banks | -| 3 | Group | 4 Tiles connected via crossbar | -| 4 | Cluster (WIP) | Multiple Groups connected via NoC (currently one Group) | +| 2 | Tile | 4 CCs + 4 × InSitu-Cache banks | +| 3 | Group | 4 Tiles connected via crossbar + shared L2 ICache | +| 4 | Cluster | Multiple Groups connected via FlooNoC XY mesh | -All tiles in a cluster share one unified L1 cache, interleaved across cache banks. The bank-selection offset is configurable at runtime via `l1d_xbar_config(...)`. +All tiles across all groups share one unified L1 data cache, interleaved across cache banks. The bank-selection offset is configurable at runtime via `l1d_xbar_config(...)`. ## Requirements @@ -68,10 +68,10 @@ make dram-build CMAKE=/path/to/cmake-3.28.x CC=/path/to/gcc-11.2 CXX=/path/to/g+ ### Generate Required RTL Some RTL components (e.g., package headers) must be generated prior to simulation. -Generation requires specifying a **configuration**. If none is provided, the default is `cachepool_512`. +Generation requires specifying a **configuration**. If none is provided, the default is `cachepool_2g`. 
```bash -make generate config=cachepool_fpu_512 +make generate config=cachepool_fpu_2g ``` ### Build the BootROM @@ -79,7 +79,7 @@ make generate config=cachepool_fpu_512 The BootROM is built separately from the RTL generation step: ```bash -make bootrom config=cachepool_fpu_512 +make bootrom config=cachepool_fpu_2g ``` ### Compilation and Simulation @@ -87,13 +87,13 @@ make bootrom config=cachepool_fpu_512 #### Build Software Only ```bash -make sw config=cachepool_fpu_512 +make sw config=cachepool_fpu_2g ``` #### Build Hardware + Software (QuestaSim) ```bash -make vsim config=cachepool_fpu_512 +make vsim config=cachepool_fpu_2g ``` #### Run the Simulation @@ -125,7 +125,7 @@ A lightweight benchmarking automation flow is provided under `util/auto-benchmar 1. Edit `configs.sh` to list the desired configurations and kernels: - CONFIGS="cachepool_fpu_512 cachepool_fpu_256 cachepool_fpu_128" + CONFIGS="cachepool_fpu_2g cachepool_fpu_4g" KERNELS="fdotp-32b_M32768 ffft-64b_M16384 fmatmul-64b_M2048" PREFIX="test-cachepool-" ROOT_PATH=../.. @@ -147,10 +147,10 @@ A lightweight benchmarking automation flow is provided under `util/auto-benchmar Example directory after a run: logs/20251028-1230/ - ├── cachepool_fpu_512_fdotp-32b_M32768.log - ├── cachepool_fpu_512_fdotp-32b_M32768_pm/ - ├── cachepool_fpu_512_summary.txt - ├── cachepool_fpu_256_summary.txt + ├── cachepool_fpu_2g_fdotp-32b_M32768.log + ├── cachepool_fpu_2g_fdotp-32b_M32768_pm/ + ├── cachepool_fpu_2g_summary.txt + ├── cachepool_fpu_4g_summary.txt └── ... Each run includes: @@ -170,46 +170,45 @@ This setup allows quick reproducible benchmarks with all results neatly organize Usage: ```bash -python3 check_ci.py logs/latest/cachepool_fpu_512_load-store.log +python3 check_ci.py logs/latest/cachepool_fpu_2g_load-store.log ``` Exit code 0 means all tests passed; exit code 1 means at least one failure was detected. On failure the offending lines and their line numbers are printed for manual inspection. 
## Configurations -All hardware knobs live in **`config/config.mk`** (and flavor files it includes). The default configuration is **4 tiles, 16 cores**. +All hardware knobs live in **`config/config.mk`** (and flavor files it includes). The default configuration is **2 groups, 4 tiles/group, 4 cores/tile = 32 cores total**. -| Flavor file | Description | -|-------------|-------------| -| `cachepool.mk` | No floating-point support | -| `cachepool_fpu.mk` | Enables single/half precision in the Spatz vector core | +Configuration names encode the number of groups and whether the FPU is enabled: -Available named configurations (passed as `config=`): - -| Name | Cacheline | FPU | -|------|-----------|-----| -| `cachepool_512` | 512b | No | -| `cachepool_128` | 128b | No | -| `cachepool_fpu_512` | 512b | Yes | -| `cachepool_fpu_256` | 256b | Yes | -| `cachepool_fpu_128` | 128b | Yes | +| Name | Groups | Mesh | FPU | Cores | +|------|--------|------|-----|-------| +| `cachepool_2g` | 2 | 1×2 | No | 32 | +| `cachepool_fpu_2g` | 2 | 1×2 | Yes | 32 | +| `cachepool_4g` | 4 | 2×2 | No | 64 | +| `cachepool_fpu_4g` | 4 | 2×2 | Yes | 64 | +| `cachepool_fpu_16g` | 16 | 4×4 | Yes | 256 | The Spatz cluster consumes **`config/cachepool.hjson`**, which is **generated** from: - `config/cachepool.hjson.tmpl` (skeleton with comments) - `config/config.mk` (source of truth) -To switch flavors, set `config=` (or export `CACHEPOOL_CONFIGURATION=`), then rebuild: +Multi-group configurations also require a FlooNoC topology file (e.g. `config/floonoc_cachepool_4g.yml`). After changing the group count, regenerate the FlooNoC package: ```bash -make clean -make generate config=cachepool_fpu_512 +make update-floonoc ``` -> `make clean` is recommended when changing configurations. +To switch configurations, always clean first: + +```bash +make clean +make generate config=cachepool_fpu_2g +``` ### How configuration flows -1. 
**`config/config.mk`** defines all parameters (e.g., `num_tiles`, `num_cores`, `l1d_cacheline_width`, `axi_user_width`, addresses, etc.). Derived values (like `axi_user_width`) are pre-computed so tools receive integers, not expressions. +1. **`config/config.mk`** defines all parameters (e.g., `num_groups`, `num_groups_x`, `num_tiles_per_group`, `num_cores_per_tile`, `l1d_cacheline_width`, `axi_user_width`, etc.). Derived values are pre-computed so tools receive integers, not expressions. 2. `make generate` calls the Python generator to produce **`config/cachepool.hjson`** from the template. 3. The Makefile passes the same values to **QuestaSim** via `VLOG_DEFS`, keeping RTL, sim, and HJSON in sync. @@ -319,7 +318,7 @@ Cluster peripherals (including the BootROM and memory-mapped registers) are inst SpyGlass lint (optional): ```bash -make lint config=cachepool_fpu_512 +make lint config=cachepool_fpu_2g ``` --- @@ -328,6 +327,7 @@ make lint config=cachepool_fpu_512 - To see the exact macros passed to vlog, check `VLOG_DEFS` in the Makefile and `sim/work/compile.vsim.tcl`. - If you change cacheline width, `AXI_USER_WIDTH` is derived (supported widths: 128→19, 256→18, 512→17). Unsupported widths error out at generation time. -- Use `make clean` when switching flavors/configs to prevent stale build artifacts. +- When changing the number of groups, run `make update-floonoc` to regenerate the FlooNoC package before `make generate`. +- Use `make clean` when switching configs to prevent stale build artifacts. - Runtime functions `snrt_tile_id()` and `snrt_num_tiles()` are available to query tile topology from software. - Changing the partition mode or boundary address while the cache holds valid data requires a flush (`l1d_flush()` or the appropriate partition flush) before reconfiguring. 
diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 658a3ed..ece3900 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -106,72 +106,72 @@ module cachepool_tile localparam int unsigned TotRGPorts = (NumRemoteGroupPortCore == 0) ? 0 : NumRemoteGroupPortCore*NrTCDMPortsPerCore-1 ) ( /// System clock. - input logic clk_i, + input logic clk_i, /// Asynchronous active high reset. This signal is assumed to be _async_. - input logic rst_ni, + input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic debug_req_i, + input logic debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. - input logic [9:0] hart_base_id_i, + input logic [9:0] hart_base_id_i, /// Base address of cluster. TCDM and cluster peripheral location are derived from /// it. This signal is pseudo-static. 
- input axi_addr_t cluster_base_addr_i, + input axi_addr_t cluster_base_addr_i, /// Tile ID, internal ID, the base is always 0, in theory should not change during use - input remote_tile_sel_t tile_id_i, + input remote_tile_sel_t tile_id_i, /// Partitioning address - input axi_addr_t private_start_addr_i, + input axi_addr_t private_start_addr_i, /// AXI Narrow out-port (UART/Peripheral) - output axi_narrow_req_t [1:0] axi_out_req_o, - input axi_narrow_resp_t [1:0] axi_out_resp_i, + output axi_narrow_req_t [1:0] axi_out_req_o, + input axi_narrow_resp_t [1:0] axi_out_resp_i, /// Cache Refill ports - output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, - input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, + output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, + input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, /// Wide AXI ports to cluster level - output axi_out_req_t [TileNarrowAxiPorts-1:0] axi_wide_req_o, - input axi_out_resp_t [TileNarrowAxiPorts-1:0] axi_wide_rsp_i, + output axi_out_req_t [TileNarrowAxiPorts-1:0] axi_wide_req_o, + input axi_out_resp_t [TileNarrowAxiPorts-1:0] axi_wide_rsp_i, /// Remote Tile access ports (to remote tiles) - output tcdm_req_t [NumRemotePortTile-1:0] remote_req_o, - output remote_tile_sel_t [NumRemotePortTile-1:0] remote_req_dst_o, - input tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_i, - input logic [NumRemotePortTile-1:0] remote_rsp_ready_i, + output tcdm_req_t [NumRemotePortTile-1:0] remote_req_o, + output remote_tile_sel_t [NumRemotePortTile-1:0] remote_req_dst_o, + input tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_i, + input logic [NumRemotePortTile-1:0] remote_rsp_ready_i, /// Remote Tile access ports (from remote tiles) - input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, - output tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_o, - output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, + input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, + output tcdm_rsp_t 
[NumRemotePortTile-1:0] remote_rsp_o, + output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, /// Inter-group remote access ports (to other groups). /// Flat layout: flat index = j + r * NrTCDMPortsPerCore, /// where j is the interco instance and r is the inter-group remote slot. /// Total count: NumRemoteGroupPortCore * NrTCDMPortsPerCore. /// Uses REQRSP-style types with built-in ready and remote_group_user_t. - output remote_group_req_t [TotRGPorts:0] remote_group_req_o, - input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, + output remote_group_req_t [TotRGPorts:0] remote_group_req_o, + input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, /// Inter-group remote access ports (from other groups) - input remote_group_req_t [TotRGPorts:0] remote_group_req_i, - output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, + input remote_group_req_t [TotRGPorts:0] remote_group_req_i, + output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// Peripheral signals - output icache_l1_events_t [NrCores-1:0] icache_events_o, - input logic icache_prefetch_enable_i, - input logic [NrCores-1:0] cl_interrupt_i, - input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, - input cache_insn_t l1d_insn_i, - input logic [3:0] l1d_private_i, - input logic l1d_insn_valid_i, - output logic l1d_insn_ready_o, - input logic l1d_busy_i, + output icache_l1_events_t [NrCores-1:0] icache_events_o, + input logic icache_prefetch_enable_i, + input logic [NrCores-1:0] cl_interrupt_i, + input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, + input cache_insn_t l1d_insn_i, + input logic [3:0] l1d_private_i, + input logic l1d_insn_valid_i, + output logic l1d_insn_ready_o, + input logic l1d_busy_i, From d084c03d8bf158ce38fb91230359a6a71182f69c Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 26 May 2026 10:24:02 +0200 Subject: [PATCH 29/37] [MISC] Update spatz revision and add a test configuration (used to explore settings). 
--- Bender.lock | 2 +- config/cachepool_4g.mk | 102 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 config/cachepool_4g.mk diff --git a/Bender.lock b/Bender.lock index cff1d5f..e4ec6e9 100644 --- a/Bender.lock +++ b/Bender.lock @@ -150,7 +150,7 @@ packages: dependencies: - tech_cells_generic spatz: - revision: 61bc09f805db7f7ae6c1ab99d3da63b0a8431281 + revision: 08847c5fcc2dfe2427c70076d5970de24d54af4c version: null source: Git: https://github.com/pulp-platform/spatz.git diff --git a/config/cachepool_4g.mk b/config/cachepool_4g.mk new file mode 100644 index 0000000..8b1b300 --- /dev/null +++ b/config/cachepool_4g.mk @@ -0,0 +1,102 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Diyou Shen, ETH Zurich + +######################### +## CachePool Cluster ## +######################### + +# Number of groups +num_groups ?= 4 + +# 2×2 mesh +num_groups_x ?= 2 + +# Number of tiles +num_tiles_per_group ?= 4 + +# Number of cores +num_cores_per_tile ?= 4 + +# Core datawidth +data_width ?= 32 + +# Core addrwidth +addr_width ?= 32 + +num_remote_ports_per_tile ?= 4 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 4 + + +###################### +## CachePool Tile ## +###################### + +# Refill interconnection data width +refill_data_width ?= 128 + +##### L1 Data Cache ##### + +# L1 data cacheline width (in Bit) +l1d_cacheline_width ?= 512 + +# L1 data cache banking factor (how many banks per core?) +l1d_bank_factor ?= 1 + +# L1 coalescing window +l1d_coal_window ?= 2 + +# L1 data cache number of ways per +l1d_num_way ?= 4 + +# L1 data cache size per tile (KiB) +l1d_tile_size ?= 256 + +# L1 data cache tag width (TODO: should be calculated) +l1d_tag_data_width ?= 92 + +#################### +## CachePool CC ## +#################### +# Spatz fpu support? 
+spatz_fpu_en ?= 0 + +# Spatz number of FPU +spatz_num_fpu ?= 0 + +# Spatz number of IPU +spatz_num_ipu ?= 4 + +# Spatz max outstanding transactions +spatz_max_trans ?= 32 + +# Snitch/FPU max outstanding transactions +snitch_max_trans ?= 16 + + +##################### +## L2 Main Memory ## +##################### +# L2 number of channels +l2_channel ?= 8 + +# L2 bank width (DRAM width, change with care) +l2_bank_width ?= 512 + +# L2 interleaving factor (in order of bank_width) +l2_interleave ?= 16 + + +################## +## Peripherals ## +################## +# Hardware stack size (in Byte) +stack_hw_size ?= 1024 + +# Stack size (total, including share and private, 32'h800) +stack_tot_size ?= 2048 From c94601534415847228bebe370a8a6eb6105dbc01 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Wed, 27 May 2026 10:07:35 +0200 Subject: [PATCH 30/37] [SRC] Remove a not-yet-merged option for cache controller. --- hardware/src/cachepool_tile.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index ece3900..1365b8b 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -1108,7 +1108,7 @@ module cachepool_tile .CacheLineWidth (L1LineWidth ), .SetAssociativity (L1AssoPerCtrl ), .BankFactor (L1BankFactor ), - .LogDebug (0 ), + // .LogDebug (0 ), .RefillDataWidth (RefillDataWidth ), // Type .core_meta_t (tcdm_user_t ), @@ -1481,7 +1481,7 @@ module cachepool_tile .FILL_DW ( AxiDataWidth ), .EARLY_LATCH ( 0 ), .L0_EARLY_TAG_WIDTH ( snitch_pkg::PAGE_SHIFT - $clog2(ICacheLineWidth/8) ), - .ISO_CROSSING ( 1'b0 ), + .ISO_CROSSING ( 1'b1 ), .axi_req_t ( axi_mst_tile_wide_req_t ), .axi_rsp_t ( axi_mst_tile_wide_resp_t ), .sram_cfg_data_t ( impl_in_t ), From fc19f10959b877016b9696485c0a2d7080dfa271 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Wed, 27 May 2026 14:11:48 +0200 Subject: [PATCH 31/37] [SRC] Typo fix. 
--- hardware/src/cachepool_pkg.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index 655509e..7029bed 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -481,7 +481,7 @@ package cachepool_pkg; } noc_group_rsp_t; // Group ICache (L2 read-only cache control) - localparam int unsigned ROCacheNumAddrRules = 4; + localparam int unsigned ROCacheNumAddrRules = 1; typedef struct packed { logic enable; logic flush_valid; From 730f652cc48343499b05ced18ad7a812cbf9be3e Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 28 May 2026 15:41:07 +0200 Subject: [PATCH 32/37] [BUG][SRC] Fix a bug in flushing protection. Add assertion to find illegal memory access to cache. --- hardware/src/cachepool_tile.sv | 74 ++++++++++++++++++------------ hardware/src/tcdm_cache_interco.sv | 19 ++++++++ 2 files changed, 63 insertions(+), 30 deletions(-) diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index 1365b8b..0fa53fb 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -413,6 +413,12 @@ module cachepool_tile tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_req, cache_xbar_req; tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_rsp, cache_xbar_rsp; + // Post-xbar gated copies. + // cache_ctrl_req : xbar output with q_valid suppressed during flush. + // cache_bank_rsp : raw response from the bank/AMO stage; q_ready is gated before + // being returned to the interco as cache_xbar_rsp. 
+ tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_ctrl_req; + tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_bank_rsp; tcdm_req_t [NumL1CtrlTile-1:0] cache_amo_req; tcdm_rsp_t [NumL1CtrlTile-1:0] cache_amo_rsp; @@ -512,16 +518,14 @@ module cachepool_tile always_comb begin : cache_flush_protection for (int j = 0; unsigned'(j) < NrTCDMPortsCores; j++) begin /***** REQ *****/ - // Wire to Cache outputs unmerge_req[j].q = tcdm_req[j].q; - // invalidate the request when cache is busy - unmerge_req[j].q_valid = tcdm_req[j].q_valid && !l1d_busy_i; + unmerge_req[j].q_valid = tcdm_req[j].q_valid; unmerge_pready[j] = 1'b1; /***** RSP *****/ tcdm_rsp[j].p = unmerge_rsp[j].p; tcdm_rsp[j].p_valid = unmerge_rsp[j].p_valid; - tcdm_rsp[j].q_ready = unmerge_rsp[j].q_ready && !l1d_busy_i; + tcdm_rsp[j].q_ready = unmerge_rsp[j].q_ready; end end @@ -546,19 +550,15 @@ module cachepool_tile // where j is the xbar index and r is the remote slot within that xbar. logic [NumRemotePortTile-1:0] remote_out_pready, remote_in_pready; - // Flush protection for remote ports. - // - // During a flush (l1d_busy_i) remote tiles must be fully stalled: - // - q_valid gated : stops new requests being presented to the xbar - // - q_ready gated : stops the xbar accepting a request that is already - // sitting at the input (spill register would otherwise - // pop it, and the transaction would be lost because the - // cache is unavailable) - // - remote_in_pready gated : stops response-ready from propagating back, - // preventing in-flight completions during the flush window + // Intra-group remote port wiring. + // q_valid and q_ready for incoming requests are passed through without gating: + // the after-xbar flush gate (cache_xbar_flush_gate) provides the authoritative + // protection at the cache bank boundary and naturally back-pressures through + // the interco to the remote sender. 
+ // response-ready (remote_in_pready) is still gated to prevent draining in-flight + // completions during the flush window. tcdm_req_t [NumRemotePortTile-1:0] remote_req_gated; - // Intermediate response signals from the xbar before q_ready gating. tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_xbar; always_comb begin : remote_flush_protection @@ -566,14 +566,10 @@ module cachepool_tile for (int r = 0; r < NumRemotePortCore; r++) begin automatic int unsigned flat = j + r * NrTCDMPortsPerCore; - // Gate q_valid: prevent new requests entering the xbar. remote_req_gated[flat].q = remote_req_i[flat].q; - remote_req_gated[flat].q_valid = remote_req_i[flat].q_valid && !l1d_busy_i; + remote_req_gated[flat].q_valid = remote_req_i[flat].q_valid; - // Pass the full xbar response through, then gate only q_ready so the - // remote tile cannot complete a handshake during a flush. remote_rsp_o[flat] = remote_rsp_xbar[flat]; - remote_rsp_o[flat].q_ready = remote_rsp_xbar[flat].q_ready && !l1d_busy_i; // Gate response-ready back to us: prevent draining completions // of requests that arrived just before the flush. @@ -614,7 +610,10 @@ module cachepool_tile automatic int unsigned flat = j + r * NrTCDMPortsPerCore; // ----------------------------------------------------------- - // Incoming: REQRSP → TCDM conversion + flush gating → interco + // Incoming: REQRSP → TCDM conversion → interco + // q_valid and q_ready are passed through without gating; the + // after-xbar flush gate (cache_xbar_flush_gate) is the authoritative + // protection point and naturally back-pressures through the interco. 
// ----------------------------------------------------------- rg_interco_in_req[flat] = '{ q: '{ @@ -632,7 +631,7 @@ module cachepool_tile }, default: '0 }, - q_valid: remote_group_req_i[flat].q_valid && !l1d_busy_i, + q_valid: remote_group_req_i[flat].q_valid, default: '0 }; @@ -652,7 +651,7 @@ module cachepool_tile default: '0 }, p_valid: rg_interco_in_rsp[flat].p_valid, - q_ready: rg_interco_in_rsp[flat].q_ready && !l1d_busy_i, + q_ready: rg_interco_in_rsp[flat].q_ready, default: '0 }; @@ -850,9 +849,9 @@ module cachepool_tile ) i_cache_amo ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .core_req_i (cache_xbar_req [j][cb] ), + .core_req_i (cache_ctrl_req [j][cb] ), .core_rsp_ready_i (cache_xbar_pready[j][cb] ), - .core_rsp_o (cache_xbar_rsp [j][cb] ), + .core_rsp_o (cache_bank_rsp [j][cb] ), .mem_req_o (cache_amo_req [cb] ), .mem_rsp_ready_o (cache_amo_pready [cb] ), .mem_rsp_i (cache_amo_rsp [cb] ) @@ -915,9 +914,9 @@ module cachepool_tile ) i_spill_reg_cache_req ( .clk_i , .rst_ni ( rst_ni ), - .valid_i ( cache_xbar_req[j][cb].q_valid ), - .ready_o ( cache_xbar_rsp[j][cb].q_ready ), - .data_i ( cache_xbar_req[j][cb].q ), + .valid_i ( cache_ctrl_req[j][cb].q_valid ), + .ready_o ( cache_bank_rsp[j][cb].q_ready ), + .data_i ( cache_ctrl_req[j][cb].q ), .valid_o ( cache_req_reg.q_valid ), .ready_i ( cache_rsp_reg.q_ready ), .data_o ( cache_req_reg.q ) @@ -932,9 +931,9 @@ module cachepool_tile .valid_i ( cache_rsp_reg.p_valid ), .ready_o ( cache_rsp_ready[cb][j] ), .data_i ( cache_rsp_reg.p ), - .valid_o ( cache_xbar_rsp[j][cb].p_valid ), + .valid_o ( cache_bank_rsp[j][cb].p_valid ), .ready_i ( cache_xbar_pready[j][cb] ), - .data_o ( cache_xbar_rsp[j][cb].p ) + .data_o ( cache_bank_rsp[j][cb].p ) ); assign cache_req_valid[cb][j] = cache_req_reg.q_valid; @@ -954,6 +953,21 @@ module cachepool_tile end end + // Post-xbar flush gate (applied uniformly across all ports). 
+ // Suppresses q_valid going into the bank so no new cache accesses are processed + // while a flush is in progress, and gates q_ready going back to the interco so the + // xbar cannot dequeue a buffered request that is already sitting at its output. + always_comb begin : cache_xbar_flush_gate + for (int j = 0; j < NrTCDMPortsPerCore; j++) begin + for (int cb = 0; cb < NumL1CtrlTile; cb++) begin + cache_ctrl_req[j][cb] = cache_xbar_req[j][cb]; + cache_ctrl_req[j][cb].q_valid = cache_xbar_req[j][cb].q_valid && !l1d_busy_i; + cache_xbar_rsp[j][cb] = cache_bank_rsp[j][cb]; + cache_xbar_rsp[j][cb].q_ready = cache_bank_rsp[j][cb].q_ready && !l1d_busy_i; + end + end + end + // Refill address inverse rotation parameters. // Must mirror the bits_to_rotate table in tcdm_cache_interco gen_scramble: // All-private or half-half private banks (cb < NumL1CtrlTile/2): diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index 199edd3..5287208 100644 --- a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -38,6 +38,7 @@ // NumRemoteGroupPort == 0. `include "common_cells/registers.svh" +`include "common_cells/assertions.svh" module tcdm_cache_interco #( /// Number of Tiles ('>= 1') @@ -63,6 +64,8 @@ module tcdm_cache_interco #( /// tile ID which encodes both group and tile-within-group: /// tile_id = {group_id, local_tile_id} parameter int unsigned TileIDWidth = 32'd1, + /// DRAM base address, used to check if we get illegal access + parameter int unsigned DramBaseAddr = 32'h8000_0000, /// Number of tiles within a single group. 
/// Used to extract the group portion from the address tile field: /// group_id = addr_tile_bits / NumTilesPerGroup @@ -526,5 +529,21 @@ module tcdm_cache_interco #( assign mem_rsp_ready_o = mem_rsp_ready; + // ------------------------------------------------------------------------- + // Assertions + // ------------------------------------------------------------------------- +`ifndef TARGET_SYNTHESIS + // Check that no valid core request targets an address below DramBaseAddr. + // Such an illegal access can stay latent in the system until the entry is evicted. + for (genvar x = 0; x < TotInPorts; x++) begin : gen_addr_assert + CoreReqAddrAboveDram: assert property ( + @(posedge clk_i) disable iff (!rst_ni !== '0) + core_req_i[x].q_valid |-> core_req_i[x].q.addr >= addr_t'(DramBaseAddr) + ) else begin + $error("[%m] port %0d: addr 0x%08x is below DramBaseAddr 0x%08x", + x, core_req_i[x].q.addr, DramBaseAddr); + end + end +`endif endmodule From ac62333203830ac47566fddb892b1be4ebd9c41f Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Thu, 28 May 2026 16:12:26 +0200 Subject: [PATCH 33/37] [CI] change wget to curl because of CI env change.
--- sim/sim.mk | 2 +- toolchain.mk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sim/sim.mk b/sim/sim.mk index 9b106b5..80342e8 100644 --- a/sim/sim.mk +++ b/sim/sim.mk @@ -82,7 +82,7 @@ ${SIM_DIR}/${DPI_LIB}/cachepool_dpi.so: ${dpi_target} # ----------------- ${WORK_DIR}/${FESVR_VERSION}_unzip: mkdir -p $(dir $@) - wget -O $(dir $@)/${FESVR_VERSION} https://github.com/riscv/riscv-isa-sim/tarball/${FESVR_VERSION} + curl -fL -o $(dir $@)/${FESVR_VERSION} https://github.com/riscv/riscv-isa-sim/tarball/${FESVR_VERSION} tar xfm $(dir $@)${FESVR_VERSION} --strip-components=1 -C $(dir $@) touch $@ diff --git a/toolchain.mk b/toolchain.mk index 2d40a65..97e58f8 100644 --- a/toolchain.mk +++ b/toolchain.mk @@ -69,7 +69,7 @@ ${TOOLCHAIN_DIR}/riscv-isa-sim: ${TOOLCHAIN_DIR}/riscv-isa-sim.version ${TOOLCHAIN_DIR}/dtc: mkdir -p ${TOOLCHAIN_DIR}/dtc - cd ${TOOLCHAIN_DIR}/dtc && wget -c https://git.kernel.org/pub/scm/utils/dtc/dtc.git/snapshot/dtc-1.7.0.tar.gz + cd ${TOOLCHAIN_DIR}/dtc && curl -fLO https://git.kernel.org/pub/scm/utils/dtc/dtc.git/snapshot/dtc-1.7.0.tar.gz cd ${TOOLCHAIN_DIR}/dtc && tar xf dtc-1.7.0.tar.gz # ---------- Build toolchains ---------- From 997cb3079fdf6d35bfe59e6c149f79da05b13819 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 1 Jun 2026 10:53:03 +0200 Subject: [PATCH 34/37] [SW] Change cache flushing routine to prevent race conditions and avoid triggering corner-case bugs.
--- README.md | 2 +- software/snRuntime/README.md | 216 +++++++++++++++--- software/snRuntime/include/l1cache.h | 3 + software/snRuntime/src/l1cache.c | 81 +++++-- .../src/platforms/standalone/start_snitch.S | 2 +- software/tests/bandwidth/main.c | 12 +- software/tests/byte-enable/main.c | 8 +- software/tests/fdotp-32b/main.c | 8 +- software/tests/fft-32b/main.c | 21 +- software/tests/fmatmul-32b/main.c | 16 +- software/tests/gemv-opt/main.c | 7 +- software/tests/gemv/main.c | 23 +- software/tests/idotp-32b/main.c | 6 +- software/tests/load-store/main.c | 62 ++--- software/tests/mcs-lock/main.c | 11 +- .../main.c | 11 +- 16 files changed, 326 insertions(+), 163 deletions(-) diff --git a/README.md b/README.md index d00a967..8b2d101 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ All tiles across all groups share one unified L1 data cache, interleaved across ## Requirements -- Linux environment with: `make`, `git`, `python3`, `wget`, `curl` +- Linux environment with: `make`, `git`, `python3`, `curl` - **CMake ≥ 3.28**, **GCC/G++ ≥ 11.2** - **QuestaSim** (tested with `questa-2023.4`) - Optional: SpyGlass for lint diff --git a/software/snRuntime/README.md b/software/snRuntime/README.md index d19e74e..04fff52 100644 --- a/software/snRuntime/README.md +++ b/software/snRuntime/README.md @@ -1,48 +1,198 @@ -# Snitch Runtime Library +# snRuntime — CachePool Software Runtime + +This library is the bare-metal software runtime for the CachePool manycore system. It is derived from the upstream Snitch runtime and extended with CachePool-specific cache management and peripheral APIs. 
+ +## Folder Structure + +``` +snRuntime/ +├── include/ # Public headers — include these in application code +│ ├── snrt.h # Master header: topology, barriers, DMA, allocation +│ ├── l1cache.h # CachePool L1 data cache management API +│ ├── cachepool_peripheral.h # Register offsets for the cluster peripheral +│ ├── perf_cnt.h # Performance counter API +│ ├── team.h # Team/cluster descriptor structs +│ ├── interface.h # Hardware interface definitions +│ ├── debug.h # Debug printf helpers +│ ├── dm.h # Data-mover (DMA) low-level interface +│ ├── eu.h # Execution unit (work dispatch) interface +│ ├── kmp.h # OpenMP KMP interface +│ └── omp.h # OpenMP runtime interface +├── src/ # Runtime implementation +│ ├── start.S # Entry point (hart 0 boots, others wait for IPI) +│ ├── team.c # Team/topology initialisation +│ ├── barrier.c # Hardware and software barrier implementations +│ ├── l1cache.c # CachePool L1 cache management (flush, partition, xbar) +│ ├── alloc.c # L1 TCDM bump allocator + DRAM linked-list allocator +│ ├── memcpy.c # Optimised memcpy +│ ├── perf_cnt.c # Performance counter helpers +│ ├── printf.c # Lightweight printf (wraps vendor/printf.c) +│ ├── dm.c / dma.c # DMA engine helpers +│ ├── interrupt.c # Interrupt initialisation +│ └── platforms/ # Platform-specific startup and putchar +├── tests/ # Self-contained runtime unit tests +├── vendor/ # Third-party sources (printf, riscv-opcodes) +└── link/ # Linker script template (common.ld.in) +``` + +## Key API + +### Topology (`snrt.h`) + +```c +uint32_t snrt_cluster_core_idx(); // Core index within the cluster (0-based) +uint32_t snrt_cluster_core_num(); // Total cores in the cluster +uint32_t snrt_cluster_tile_idx(); // Tile index within the cluster +uint32_t snrt_cluster_tile_num(); // Number of tiles in the cluster +int snrt_is_compute_core(); // Non-zero if this is a compute (non-DMA) core +``` + +### Synchronisation (`snrt.h`) + +```c +void snrt_cluster_hw_barrier(); // Hardware barrier: stalls until 
all cluster cores arrive +void snrt_cluster_sw_barrier(); // Software barrier (polling) +void snrt_global_barrier(); // Cluster-to-cluster barrier +``` + +### L1 Data Cache — CachePool-specific (`l1cache.h`) + +All **cluster-wide** functions must be called by **every core** in the cluster. They +internally issue a `fence`, a hardware barrier, execute the operation on core 0 only, +and then issue a final barrier before returning. The low-level single-core variants +(without the `_cluster_` prefix) are for use inside the runtime or in single-core +contexts only. + +#### Cluster-wide flush (recommended for application code) + +```c +void l1d_cluster_flush(); // Flush all banks in all tiles +void l1d_cluster_shared_flush(); // Flush shared banks only +void l1d_cluster_private_flush(uint32_t tile); // Flush private banks of selected tiles (one-hot mask) +``` + +#### Cache configuration (cluster-wide) + +```c +// Set the crossbar interleaving offset (in bits). +// Granularity is clamped to >= log2(cacheline_bytes). +// Example: l1d_xbar_config(6) for 512-bit cachelines (6 = log2(64)). +void l1d_xbar_config(uint32_t offset); + +// Set the number of private banks per tile (0=all-shared … 4=all-private). +void l1d_part(uint32_t size); +``` + +#### Address boundary and polling + +```c +// Set the private/shared address boundary (default 0xA000_0000). +// Addresses >= boundary are private; addresses < boundary are shared. +// Requires a flush before changing while valid data is cached. +void l1d_addr(uint32_t addr); + +// Poll the peripheral until the current flush instruction completes. +// Used by the low-level flush functions; not normally needed in application code. +void l1d_wait(); +``` + +#### Cache initialisation (called once at boot, single-core) + +```c +// Invalidate all cache banks (insn = 2'b11). Called from start_snitch.S. 
+void l1d_init(uint32_t size); +``` + +### Performance Counters (`perf_cnt.h`) *TODO: REMOVE* + +```c +void snrt_start_perf_counter(enum snrt_perf_cnt, enum snrt_perf_cnt_type, uint32_t hart_id); +void snrt_stop_perf_counter(enum snrt_perf_cnt); +void snrt_reset_perf_counter(enum snrt_perf_cnt); +uint32_t snrt_get_perf_counter(enum snrt_perf_cnt); +``` + +Counter types include cycles, TCDM accesses, TCDM congestion, FPU issues, retired +instructions, DMA bandwidth events, and ICache statistics. + +### Memory Allocation (`snrt.h`) + +Two allocators are provided for different memory regions. + +**L1 TCDM — bump allocator** (no free support): + +```c +void *snrt_l1alloc(size_t size); // Bump-allocate from cluster TCDM scratchpad +void snrt_l1alloc_reset(); // Reclaim all L1 allocations at once +``` + +**DRAM — linked-list allocator** (single-core, supports free + coalescing): + +```c +void *snrt_malloc(size_t size); // Allocate from DRAM; payload rounded up to 64 B +void snrt_free(void *ptr); // Free and coalesce with following free blocks +``` -This library implements a minimal runtime for Snitch systems, which is responsible for the following: +Both the block header and the payload are cacheline-aligned (64 bytes). A request for +any size — even 1 byte — allocates a minimum of 64 bytes of payload. The allocator +must be called by a **single core only**; it is not thread-safe by design since +allocation is expected to happen in single-core initialisation phases. -- Detecting the hardware configuration (cores, clusters, ISA extensions, TCDM) -- Passing a descriptor struct to the executable -- Synchronization across cores and clusters -- Team-based multithreading and work splitting +The heap begins at `_edram + l3off` (set in `snrt_alloc_init`) and grows upward. +Block headers (64 bytes each) are stored in DRAM immediately before their payloads +and are accessed through the L1 cache like any other data. 
+ -## General Runtime +### DMA (`snrt.h`) *TODO: REMOVE* -The general runtime (`libsnRuntime`) relies on a bootloader or operating system to load the executable. This usually requires virtual memory to map the segments to the correct addresses. The general runtime does not provide any startup code in this scenario, but is more like a regular library providing some useful API. +```c +snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src, size_t size); +snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, size_t size, + size_t dst_stride, size_t src_stride, size_t repeat); +void snrt_dma_wait(snrt_dma_txid_t tid); +void snrt_dma_wait_all(); +``` -## Bare Runtime +## Typical Initialisation Pattern -The bare runtimes (`libsnRuntime-`) assumes that the executable it is being linked into will run in a bare-metal fashion with no convenient bootloader or virtual memory setup. For this scenario, the runtime provides the `_start` symbol and implements a basic crt0. +```c +#include "snrt.h" +#include "l1cache.h" -## Usage +int main() { + const uint32_t cid = snrt_cluster_core_idx(); -The runtime library can be compiled as follows: + // Configure cache xbar and partition — must be called by ALL cores. + l1d_xbar_config(6); // interleave at cacheline granularity + l1d_part(0); // all-shared - mkdir build - cd build - cmake .. - make + // Single-core init: allocate buffers, set up data structures. + if (cid == 0) { + float *buf = (float *)snrt_malloc(N * sizeof(float)); + // ... populate buf, other setup ... + } + snrt_cluster_hw_barrier(); -The tests can be executed as follows: + // ... parallel computation ... - make test + // Flush before reading results back — must be called by ALL cores. + l1d_cluster_flush(); -Interesting CMake options that can be set via `-D