diff --git a/.gitignore b/.gitignore index 8265b48..5a930ee 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,9 @@ hardware/deps/* *.tdb util/lint/sg_projects util/lint/tmp +hardware/bootrom/bootdata.cc +hardware/bootrom/bootdata_bootrom.cc +hardware/bootrom/bootrom.bin +hardware/bootrom/bootrom.dump +hardware/bootrom/bootrom.elf +hardware/bootrom/bootrom.sv diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 567242b..b54b177 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,29 +8,32 @@ variables: GIT_SUBMODULE_STRATEGY: none ROOT_DIR: '$CI_PROJECT_DIR' APPS: "tests" - PATH: '/home/gitlabci/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:/home/gitlabci/.local/bin' + PATH: '$HOME/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:$HOME/.local/bin' OBJCACHE: '' CC: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/gcc' CXX: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/g++' CMAKE: 'cmake-3.28.3' python: 'python3' python3: 'python3' + # Config to build and test + CI_CONFIG: 'cachepool_fpu_2g' + SW_PREFIX: 'test-cachepool-' default: - tags: [dolent] + tags: [shared] stages: - build + - test -.base: - artifacts: - when: always - expire_in: 1 day - -build-vsim: - extends: .base +# --------------------------------------------------------------------------- +# Build stage: compile RTL and software for CI_CONFIG. +# Parallel jobs within the same pipeline share $HOME, so the toolchain +# installed by make quick-tool is automatically available to all test jobs. 
+# --------------------------------------------------------------------------- +build: stage: build - timeout: 5h + timeout: 4h 30m script: - echo "Using CC=$CC" - echo "Using CXX=$CXX" @@ -39,10 +42,57 @@ build-vsim: - make quick-tool - make init - make dram-build - - cd util/auto-benchmark - - chmod +x ./run_ci.sh - - ./run_ci.sh + - python3 -m pip install --quiet -r requirements.txt + - make clean generate vsim config=$CI_CONFIG + artifacts: + when: always + expire_in: 2h + paths: + # QuestaSim compiled work library + - sim/work/ + # vsim wrapper scripts (exclude sim/bin/logs/ — not needed by test jobs) + - sim/bin/cachepool_cluster.vsim + - sim/bin/cachepool_cluster.vsim.gui + # DPI shared library + - sim/work-dpi/ + # Software binaries for all kernels + - software/build/CachePoolTests/ + # DRAMSys shared libraries and config files (referenced by vsim at runtime) + - hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/build/lib/ + - hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/configs/ +# --------------------------------------------------------------------------- +# Test stage: run each kernel in parallel on a separate runner. +# Each job downloads the build artifacts, runs one simulation, and checks +# the output log for failures. +# --------------------------------------------------------------------------- +test: + stage: test + timeout: 1h + needs: [build] + parallel: + matrix: + - KERNEL: + - spin-lock + - load-store_M16 + - fdotp-32b_M32768 + - gemv_M512_N128_K32 + - fmatmul-32b_M64_N64_K64 + - fft-32b_M1024_N16 + - multi_producer_single_consumer_double_linked_list_M1_N1350_K10 + - byte-enable + script: + # The vsim script writes a .rtlbinary marker here; ensure the dir exists. 
+ - mkdir -p sim/bin/logs + - chmod +x sim/bin/cachepool_cluster.vsim + - BIN="${SW_PREFIX}${KERNEL}" + - sim/bin/cachepool_cluster.vsim software/build/CachePoolTests/$BIN 2>&1 | tee test_${KERNEL}.log + - python3 util/auto-benchmark/check-ci.py test_${KERNEL}.log artifacts: + when: always + expire_in: 1 week paths: - - util/auto-benchmark/logs + # Full simulation log + - test_*.log + # Performance-monitor trace files written by the simulator + - sim/bin/logs/ diff --git a/Bender.lock b/Bender.lock index 9684f42..62bfa1f 100644 --- a/Bender.lock +++ b/Bender.lock @@ -16,17 +16,34 @@ packages: - common_verification - tech_cells_generic axi_riscv_atomics: - revision: 97dcb14ef057cbe5bd70dda2060b5bb9e7e04c6d - version: 0.7.0 + revision: 97a1dd2ac643c276880420a0cf8eea697f228aa9 + version: 0.8.3 source: Git: https://github.com/pulp-platform/axi_riscv_atomics.git dependencies: - axi - common_cells - common_verification + axi_stream: + revision: 54891ff40455ca94a37641b9da4604647878cc07 + version: 0.1.1 + source: + Git: https://github.com/pulp-platform/axi_stream.git + dependencies: + - common_cells + cluster_icache: + revision: ce0ed94a5b95f5c76b9fa51940303fcce53f56e5 + version: null + source: + Git: https://github.com/pulp-platform/cluster_icache.git + dependencies: + - axi + - common_cells + - scm + - tech_cells_generic common_cells: - revision: 9afda9abb565971649c2aa0985639c096f351171 - version: 1.38.0 + revision: 9ca8a7655f741e7dd5736669a20a301325194c28 + version: 1.39.0 source: Git: https://github.com/pulp-platform/common_cells.git dependencies: @@ -45,8 +62,27 @@ packages: Git: https://github.com/pulp-platform/dram_rtl_sim.git dependencies: - axi + floo_noc: + revision: 97306733f33acbb646c7e403c03a674fc1404b44 + version: null + source: + Git: https://github.com/pulp-platform/FlooNoC.git + dependencies: + - axi + - axi_riscv_atomics + - common_cells + - common_verification + - floo_noc_pd + - fpnew + - idma + floo_noc_pd: + revision: null + version: null + source: + 
Path: hardware/deps/floo_noc/./pd + dependencies: [] fpnew: - revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 + revision: e5aa6a01b5bbe1675c3aa8872e1203413ded83d1 version: null source: Git: https://github.com/pulp-platform/cvfpu.git @@ -61,14 +97,16 @@ packages: dependencies: - common_cells idma: - revision: b31e8f019c657eff4126bc789f0336d403da6766 - version: 0.4.2 + revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e + version: 0.6.5 source: Git: https://github.com/pulp-platform/iDMA.git dependencies: - axi + - axi_stream - common_cells - common_verification + - obi - register_interface insitu-cache: revision: fa761ddebc946f9b46509d84945bf41ee1a9ec49 @@ -79,6 +117,14 @@ packages: - axi - common_cells - register_interface + obi: + revision: 0155fc34e900c7c884e081c0a1114a247937ff69 + version: 0.1.7 + source: + Git: https://github.com/pulp-platform/obi.git + dependencies: + - common_cells + - common_verification register_interface: revision: 146501d80052b61475cdc333d3aab4cd769fd5dc version: 0.3.9 @@ -96,8 +142,15 @@ packages: dependencies: - common_cells - tech_cells_generic + scm: + revision: 1976c7efb4979271eee2abe262fde0f9a20e2557 + version: 1.2.1 + source: + Git: https://github.com/pulp-platform/scm.git + dependencies: + - tech_cells_generic spatz: - revision: ed25c78dd72d839db8141287f9516d78ee399b93 + revision: 944e9042379fbe1db69567655cffd7a2e260ff16 version: null source: Git: https://github.com/pulp-platform/spatz.git diff --git a/Bender.yml b/Bender.yml index 45b01da..7873a05 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,13 +10,14 @@ dependencies: axi_riscv_atomics: { git: "https://github.com/pulp-platform/axi_riscv_atomics.git", version: 0.7.0 } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.28.0 } FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } - idma: { git: "https://github.com/pulp-platform/iDMA.git", version: 0.4.2 } register_interface: { git: 
"https://github.com/pulp-platform/register_interface.git", version: 0.3.8 } riscv-dbg: { git: "https://github.com/pulp-platform/riscv-dbg.git", version: 0.7.0 } tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.11 } Insitu-Cache: { git: "https://github.com/pulp-platform/Insitu-Cache.git", rev: zexin/cachepool_dev } - spatz: { git: "https://github.com/pulp-platform/spatz.git", rev: cachepool-32b } dram_rtl_sim: { git: "https://github.com/pulp-platform/dram_rtl_sim.git", rev: cachepool } + floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: main } + cluster_icache: { git: "https://github.com/pulp-platform/cluster_icache.git", rev: main } + spatz: { git: "https://github.com/pulp-platform/spatz.git", rev: cachepool-32b } workspace: checkout_dir: "./hardware/deps" @@ -29,22 +30,27 @@ sources: - hardware/src/tcdm_cache_interco.sv - hardware/src/tcdm_id_remapper.sv - hardware/src/spatz_cache_amo.sv + # FlooNoC + - hardware/generated/floo_cachepool_noc_pkg.sv # Memory-mapped register - hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv - hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv - hardware/cachepool_peripheral/cachepool_peripheral.sv # Bootrom - hardware/bootrom/bootrom.sv - # Barrier - - hardware/src/cachepool_tile_barrier.sv - - hardware/src/cachepool_cluster_barrier.sv # Level 1 - hardware/src/cachepool_pkg.sv - hardware/src/cachepool_cc.sv + # Barrier + - hardware/src/cachepool_tile_barrier.sv + - hardware/src/cachepool_cluster_barrier.sv + # ICache + - hardware/src/axi_hier_interco.sv # Level 2 - hardware/src/cachepool_tile.sv # Level 3 - hardware/src/cachepool_group.sv + - hardware/src/cachepool_group_noc_wrapper.sv - hardware/src/cachepool_cluster.sv # Level 4 diff --git a/Makefile b/Makefile index 588cf0f..9bd24f0 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ CACHE_PATH := $(shell [ -x "$(BENDER)" ] && $(BENDER) path insitu-cac # Configurations CFG_DIR ?= 
${CACHEPOOL_DIR}/config -config ?= cachepool_512 +config ?= cachepool_fpu_2g # Compiler choice for SW cmake COMPILER ?= llvm @@ -163,6 +163,30 @@ $(BOOTROM_DIR)/bootrom.sv: $(BOOTROM_DIR)/bootrom.bin $(BOOTROM_DIR)/bootdata.cc ${PYTHON} $(SCRIPTS_DIR)/generate_bootrom.py \ $< -c $(HJSON_OUT) --output $@ +########### +# FlooNoC # +########### +FLOO_DIR ?= $(shell $(BENDER_INSTALL_DIR)/bender path floo_noc) +FLOO_GEN_OUTDIR ?= $(ROOT_DIR)/hardware/generated +FLOO_CFG ?= $(ROOT_DIR)/config/floonoc_cachepool_4g.yml +FLOO_NAME = cachepool +FLOO_NOC ?= $(FLOO_GEN_OUTDIR)/floo_$(FLOO_NAME)_noc_pkg.sv + +$(info FLOO_DIR: $(FLOO_DIR)) + +# Generates the sources for FlooNoC +.PHONY: update-floonoc install-floogen clean-floonoc +install-floogen: + pip install -e $(FLOO_DIR) --quiet + +update-floonoc: $(FLOO_NOC) +$(FLOO_NOC): install-floogen $(FLOO_CFG) + mkdir -p $(FLOO_GEN_OUTDIR) + PATH="$(HOME)/.local/bin:$(PATH)" floogen pkg -c $(FLOO_CFG) -o $(FLOO_GEN_OUTDIR) --no-format + +clean-floonoc: + rm -f $(FLOO_NOC) + ########### # DramSys # ########### @@ -232,33 +256,32 @@ VLOG_FLAGS += -64 VLOG_DEFS = -DCACHEPOOL # Cluster configuration +VLOG_DEFS += -DNUM_GROUPS=$(num_groups) +VLOG_DEFS += -DNUM_GROUPS_X=$(num_groups_x) VLOG_DEFS += -DNUM_TILES=$(num_tiles) VLOG_DEFS += -DNUM_CORES=$(num_cores) VLOG_DEFS += -DDATA_WIDTH=$(data_width) VLOG_DEFS += -DADDR_WIDTH=$(addr_width) # Tile configuration -VLOG_DEFS += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) VLOG_DEFS += -DREFILL_DATA_WIDTH=$(refill_data_width) # L1 Data Cache VLOG_DEFS += -DL1D_CACHELINE_WIDTH=$(l1d_cacheline_width) -VLOG_DEFS += -DL1D_SIZE=$(l1d_size) -VLOG_DEFS += -DL1D_BANK_FACTOR=$(l1d_bank_factor) VLOG_DEFS += -DL1D_COAL_WINDOW=$(l1d_coal_window) VLOG_DEFS += -DL1D_NUM_WAY=$(l1d_num_way) -VLOG_DEFS += -DL1D_TILE_SIZE=$(l1d_tile_size) VLOG_DEFS += -DL1D_TAG_DATA_WIDTH=$(l1d_tag_data_width) VLOG_DEFS += -DL1D_NUM_BANKS=$(l1d_num_banks) VLOG_DEFS += -DL1D_DEPTH=$(l1d_depth) # CachePool CC / core cluster 
-VLOG_DEFS += -DSPATZ_FPU_EN=$(spatz_fpu_en) VLOG_DEFS += -DSPATZ_NUM_FPU=$(spatz_num_fpu) VLOG_DEFS += -DSPATZ_NUM_IPU=$(spatz_num_ipu) VLOG_DEFS += -DSPATZ_MAX_TRANS=$(spatz_max_trans) VLOG_DEFS += -DSNITCH_MAX_TRANS=$(snitch_max_trans) VLOG_DEFS += -DREMOTE_PORT_PER_CORE=$(num_remote_ports_per_tile) +VLOG_DEFS += -DRG_PORT_PER_CORE=$(num_rg_ports_per_core) +VLOG_DEFS += -DNOC_PORT_PER_TILE=$(num_noc_ports_per_tile) # AXI configuration VLOG_DEFS += -DAXI_USER_WIDTH=$(axi_user_width) @@ -268,14 +291,12 @@ VLOG_DEFS += -DL2_CHANNEL=$(l2_channel) VLOG_DEFS += -DL2_BANK_WIDTH=$(l2_bank_width) VLOG_DEFS += -DL2_INTERLEAVE=$(l2_interleave) -# Peripherals / memory map -VLOG_DEFS += -DSTACK_ADDR=$(stack_addr) +# Stack / SPM (boot_addr, stack_addr, periph_start_addr, uart_addr used by hjson +# generator via environment; not consumed as SV defines) VLOG_DEFS += -DSTACK_HW_SIZE=$(stack_hw_size) VLOG_DEFS += -DSTACK_HW_DEPTH=$(stack_hw_depth) VLOG_DEFS += -DSTACK_TOT_SIZE=$(stack_tot_size) -VLOG_DEFS += -DPERIPH_START_ADDR=$(periph_start_addr) -VLOG_DEFS += -DBOOT_ADDR=$(boot_addr) -VLOG_DEFS += -DUART_ADDR=$(uart_addr) +VLOG_DEFS += -DSTACK_TOT_DEPTH=$(stack_tot_depth) ENABLE_CACHEPOOL_TESTS ?= 1 @@ -365,6 +386,9 @@ help: @echo "*generate*: generate the Spatz package and opcodes, and the cluster config HJSON" @echo "*cache-init*: source the insitu-cache environment (requires bender checkout)" @echo "*bootrom*: compile and generate the bootrom SystemVerilog module" + @echo "*update-floonoc*: regenerate FlooNoC package from FLOO_CFG (run after changing group count)" + @echo "*install-floogen*: install the floogen Python tool (required by update-floonoc)" + @echo "*clean-floonoc*: remove the generated FlooNoC package" @echo "" @echo "DRAMSys:" @echo "" diff --git a/README.md b/README.md index 48c83e9..8b2d101 100644 --- a/README.md +++ b/README.md @@ -13,15 +13,15 @@ CachePool is a Snitch–Spatz–based many-core system with a shared L1 data cac | Level | Module | Description | 
|-------|--------|-------------| | 1 | Core Complex (CC) | One 32-bit Snitch + one Spatz RVV accelerator | -| 2 | Tile | 4 CCs + 4 × 64 KiB 4-way InSitu-Cache banks | -| 3 | Group | 4 Tiles connected via crossbar | -| 4 | Cluster (WIP) | Multiple Groups connected via NoC (currently one Group) | +| 2 | Tile | 4 CCs + 4 × InSitu-Cache banks | +| 3 | Group | 4 Tiles connected via crossbar + shared L2 ICache | +| 4 | Cluster | Multiple Groups connected via FlooNoC XY mesh | -All tiles in a cluster share one unified L1 cache, interleaved across cache banks. The bank-selection offset is configurable at runtime via `l1d_xbar_config(...)`. +All tiles across all groups share one unified L1 data cache, interleaved across cache banks. The bank-selection offset is configurable at runtime via `l1d_xbar_config(...)`. ## Requirements -- Linux environment with: `make`, `git`, `python3`, `wget`, `curl` +- Linux environment with: `make`, `git`, `python3`, `curl` - **CMake ≥ 3.28**, **GCC/G++ ≥ 11.2** - **QuestaSim** (tested with `questa-2023.4`) - Optional: SpyGlass for lint @@ -68,10 +68,10 @@ make dram-build CMAKE=/path/to/cmake-3.28.x CC=/path/to/gcc-11.2 CXX=/path/to/g+ ### Generate Required RTL Some RTL components (e.g., package headers) must be generated prior to simulation. -Generation requires specifying a **configuration**. If none is provided, the default is `cachepool_512`. +Generation requires specifying a **configuration**. If none is provided, the default is `cachepool_fpu_2g`. 
```bash -make generate config=cachepool_fpu_512 +make generate config=cachepool_fpu_2g ``` ### Build the BootROM @@ -79,7 +79,7 @@ make generate config=cachepool_fpu_512 The BootROM is built separately from the RTL generation step: ```bash -make bootrom config=cachepool_fpu_512 +make bootrom config=cachepool_fpu_2g ``` ### Compilation and Simulation @@ -87,13 +87,13 @@ make bootrom config=cachepool_fpu_512 #### Build Software Only ```bash -make sw config=cachepool_fpu_512 +make sw config=cachepool_fpu_2g ``` #### Build Hardware + Software (QuestaSim) ```bash -make vsim config=cachepool_fpu_512 +make vsim config=cachepool_fpu_2g ``` #### Run the Simulation @@ -125,7 +125,7 @@ A lightweight benchmarking automation flow is provided under `util/auto-benchmar 1. Edit `configs.sh` to list the desired configurations and kernels: - CONFIGS="cachepool_fpu_512 cachepool_fpu_256 cachepool_fpu_128" + CONFIGS="cachepool_fpu_2g cachepool_fpu_4g" KERNELS="fdotp-32b_M32768 ffft-64b_M16384 fmatmul-64b_M2048" PREFIX="test-cachepool-" ROOT_PATH=../.. @@ -147,10 +147,10 @@ A lightweight benchmarking automation flow is provided under `util/auto-benchmar Example directory after a run: logs/20251028-1230/ - ├── cachepool_fpu_512_fdotp-32b_M32768.log - ├── cachepool_fpu_512_fdotp-32b_M32768_pm/ - ├── cachepool_fpu_512_summary.txt - ├── cachepool_fpu_256_summary.txt + ├── cachepool_fpu_2g_fdotp-32b_M32768.log + ├── cachepool_fpu_2g_fdotp-32b_M32768_pm/ + ├── cachepool_fpu_2g_summary.txt + ├── cachepool_fpu_4g_summary.txt └── ... Each run includes: @@ -170,46 +170,45 @@ This setup allows quick reproducible benchmarks with all results neatly organize Usage: ```bash -python3 check_ci.py logs/latest/cachepool_fpu_512_load-store.log +python3 check_ci.py logs/latest/cachepool_fpu_2g_load-store.log ``` Exit code 0 means all tests passed; exit code 1 means at least one failure was detected. On failure the offending lines and their line numbers are printed for manual inspection. 
## Configurations -All hardware knobs live in **`config/config.mk`** (and flavor files it includes). The default configuration is **4 tiles, 16 cores**. +All hardware knobs live in **`config/config.mk`** (and flavor files it includes). The default configuration is **2 groups, 4 tiles/group, 4 cores/tile = 32 cores total**. -| Flavor file | Description | -|-------------|-------------| -| `cachepool.mk` | No floating-point support | -| `cachepool_fpu.mk` | Enables single/half precision in the Spatz vector core | +Configuration names encode the number of groups and whether the FPU is enabled: -Available named configurations (passed as `config=`): - -| Name | Cacheline | FPU | -|------|-----------|-----| -| `cachepool_512` | 512b | No | -| `cachepool_128` | 128b | No | -| `cachepool_fpu_512` | 512b | Yes | -| `cachepool_fpu_256` | 256b | Yes | -| `cachepool_fpu_128` | 128b | Yes | +| Name | Groups | Mesh | FPU | Cores | +|------|--------|------|-----|-------| +| `cachepool_2g` | 2 | 1×2 | No | 32 | +| `cachepool_fpu_2g` | 2 | 1×2 | Yes | 32 | +| `cachepool_4g` | 4 | 2×2 | No | 64 | +| `cachepool_fpu_4g` | 4 | 2×2 | Yes | 64 | +| `cachepool_fpu_16g` | 16 | 4×4 | Yes | 256 | The Spatz cluster consumes **`config/cachepool.hjson`**, which is **generated** from: - `config/cachepool.hjson.tmpl` (skeleton with comments) - `config/config.mk` (source of truth) -To switch flavors, set `config=` (or export `CACHEPOOL_CONFIGURATION=`), then rebuild: +Multi-group configurations also require a FlooNoC topology file (e.g. `config/floonoc_cachepool_4g.yml`). After changing the group count, regenerate the FlooNoC package: ```bash -make clean -make generate config=cachepool_fpu_512 +make update-floonoc ``` -> `make clean` is recommended when changing configurations. +To switch configurations, always clean first: + +```bash +make clean +make generate config=cachepool_fpu_2g +``` ### How configuration flows -1. 
**`config/config.mk`** defines all parameters (e.g., `num_tiles`, `num_cores`, `l1d_cacheline_width`, `axi_user_width`, addresses, etc.). Derived values (like `axi_user_width`) are pre-computed so tools receive integers, not expressions. +1. **`config/config.mk`** defines all parameters (e.g., `num_groups`, `num_groups_x`, `num_tiles_per_group`, `num_cores_per_tile`, `l1d_cacheline_width`, `axi_user_width`, etc.). Derived values are pre-computed so tools receive integers, not expressions. 2. `make generate` calls the Python generator to produce **`config/cachepool.hjson`** from the template. 3. The Makefile passes the same values to **QuestaSim** via `VLOG_DEFS`, keeping RTL, sim, and HJSON in sync. @@ -319,7 +318,7 @@ Cluster peripherals (including the BootROM and memory-mapped registers) are inst SpyGlass lint (optional): ```bash -make lint config=cachepool_fpu_512 +make lint config=cachepool_fpu_2g ``` --- @@ -328,6 +327,7 @@ make lint config=cachepool_fpu_512 - To see the exact macros passed to vlog, check `VLOG_DEFS` in the Makefile and `sim/work/compile.vsim.tcl`. - If you change cacheline width, `AXI_USER_WIDTH` is derived (supported widths: 128→19, 256→18, 512→17). Unsupported widths error out at generation time. -- Use `make clean` when switching flavors/configs to prevent stale build artifacts. +- When changing the number of groups, run `make update-floonoc` to regenerate the FlooNoC package before `make generate`. +- Use `make clean` when switching configs to prevent stale build artifacts. - Runtime functions `snrt_tile_id()` and `snrt_num_tiles()` are available to query tile topology from software. - Changing the partition mode or boundary address while the cache holds valid data requires a flush (`l1d_flush()` or the appropriate partition flush) before reconfiguring. 
diff --git a/config/cachepool.hjson b/config/cachepool.hjson index 2ac0947..652b4b7 100644 --- a/config/cachepool.hjson +++ b/config/cachepool.hjson @@ -53,11 +53,11 @@ register_offload_rsp: true }, - nr_tiles: 4, + nr_tiles: 8, - // Repeat the compute core template N times (driven by 16) + // Repeat the compute core template N times (driven by 32) cores: [ - { $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" } + { $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: 
"#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" } ], icache: { diff --git a/config/cachepool_512.mk b/config/cachepool_2g.mk similarity index 87% rename from config/cachepool_512.mk rename to config/cachepool_2g.mk index 6d04a68..271eedb 100644 --- a/config/cachepool_512.mk +++ b/config/cachepool_2g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 2 + +# 1×2 mesh +num_groups_x ?= 1 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,14 +26,17 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 1 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 @@ -36,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -48,7 +54,7 @@ l1d_coal_window ?= 2 # L1 data cache number of ways per l1d_num_way ?= 4 -# L1 data cache size **per tile** (KiB) +# L1 data cache size per tile (KiB) l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) diff --git a/config/cachepool_128.mk b/config/cachepool_4g.mk similarity index 85% rename from config/cachepool_128.mk rename to config/cachepool_4g.mk index df52dab..8b1b300 100644 --- a/config/cachepool_128.mk +++ b/config/cachepool_4g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 4 + +# 2×2 mesh +num_groups_x ?= 2 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,24 +26,24 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 4 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 4 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 ##### L1 Data Cache ##### # L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 128 - -# L1 data cache size (in KiB) -l1d_size ?= 256 +l1d_cacheline_width ?= 512 # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -52,7 +58,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 52 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## @@ -77,7 +83,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 8 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/config/cachepool_fpu_256.mk b/config/cachepool_fpu_16g.mk similarity index 84% rename from config/cachepool_fpu_256.mk rename to config/cachepool_fpu_16g.mk index 279dc80..8cf8445 100644 --- a/config/cachepool_fpu_256.mk +++ b/config/cachepool_fpu_16g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 16 + +# 4×4 mesh +num_groups_x ?= 4 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,24 +26,24 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 2 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 ##### L1 Data Cache ##### # L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 256 - -# L1 data cache size (in KiB) -l1d_size ?= 256 +l1d_cacheline_width ?= 512 # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -52,7 +58,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 64 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## @@ -77,13 +83,13 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 16 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 # L2 interleaving factor (in order of bank_width) -l2_interleave ?= 8 +l2_interleave ?= 16 ################## diff --git a/config/cachepool_fpu_128.mk b/config/cachepool_fpu_2g.mk similarity index 84% rename from config/cachepool_fpu_128.mk rename to config/cachepool_fpu_2g.mk index e60aad4..2689e8d 100644 --- a/config/cachepool_fpu_128.mk +++ b/config/cachepool_fpu_2g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 2 + +# 1×2 mesh +num_groups_x ?= 1 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,30 +26,30 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 1 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width -refill_data_width ?= 128 +refill_data_width ?= 512 ##### L1 Data Cache ##### # L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 128 - -# L1 data cache size (in KiB) -l1d_size ?= 256 +l1d_cacheline_width ?= 512 # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 # L1 coalecsing window -l1d_coal_window ?= 1 +l1d_coal_window ?= 2 # L1 data cache number of ways per l1d_num_way ?= 4 @@ -52,7 +58,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 64 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## diff --git a/config/cachepool_fpu_512.mk b/config/cachepool_fpu_4g.mk similarity index 88% rename from config/cachepool_fpu_512.mk rename to config/cachepool_fpu_4g.mk index 2e4c3ca..90a6af8 100644 --- a/config/cachepool_fpu_512.mk +++ b/config/cachepool_fpu_4g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 4 + +# 2×2 mesh +num_groups_x ?= 2 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,16 +26,17 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 -num_remote_ports_per_tile ?= 2 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 2 ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 @@ -38,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -79,7 +83,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 8 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/config/config.mk b/config/config.mk index 9eee8cb..1164e89 100644 --- a/config/config.mk +++ b/config/config.mk @@ -26,13 +26,25 @@ include $(CACHEPOOL_DIR)/config/$(config).mk ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 1 + +# X dimension of the group mesh (Y = num_groups / num_groups_x) +num_groups_x ?= 1 + # Number of tiles -num_tiles ?= 1 +num_tiles_per_group ?= 4 +num_tiles = $(shell echo $$(( $(num_groups) * $(num_tiles_per_group)))) num_remote_ports_per_tile ?= 1 # Number of cores -num_cores ?= 4 +num_cores_per_tile ?= 4 +num_cores ?= $(shell echo $$(( $(num_tiles) * $(num_cores_per_tile)))) + +num_rg_ports_per_core ?= 0 + +num_noc_ports_per_tile ?= 1 # Core datawidth data_width ?= 32 @@ -45,9 +57,6 @@ addr_width ?= 32 ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 @@ -56,9 +65,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -116,6 +122,15 @@ endif ##################### ## L2 Main Memory ## ##################### + +# DRAM base address and size (hex: 0x8000_0000, 0x2000_0000) +dram_addr ?= 2147483648 +dram_len ?= 536870912 + +# Uncached region base address and size (hex: 0xC000_0000, 0x2000_0000) +uncached_addr ?= 3221225472 +uncached_len ?= 536870912 + # L2 number of channels l2_channel ?= 4 diff --git a/config/floonoc_cachepool_4g.yml b/config/floonoc_cachepool_4g.yml new file mode 100644 index 0000000..2c81e28 --- /dev/null +++ b/config/floonoc_cachepool_4g.yml @@ -0,0 +1,89 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +name: cachepool +description: "CachePool AXI NoC" +network_type: "axi" + +routing: + route_algo: "SRC" + use_id_table: true + +protocols: + - name: "wide_in" + type: "wide" + protocol: "AXI4" + data_width: 256 + addr_width: 32 + id_width: 2 + user_width: 1 + - name: "wide_out" + type: "wide" + protocol: "AXI4" + data_width: 256 + addr_width: 32 + id_width: 2 + user_width: 1 + +endpoints: + - name: "group" + array: [2, 2] + mgr_port_protocol: + - "wide_in" + - name: "hbm" + array: [4] + addr_range: + base: 0x8000_0000 + size: 0x0010_0000 + sbr_port_protocol: + - "wide_out" + - name: "host_peri" + addr_range: + - start: 0x0000_0000 + end: 0x7FFF_FFFF + - start: 0xA000_0000 + end: 0xC000_FFFF + mgr_port_protocol: + - "wide_in" + sbr_port_protocol: + - "wide_out" + +routers: + - name: "group_router" + array: [2, 2] + degree: 5 + +connections: + - src: "group" + dst: "group_router" + src_range: + - [0, 1] + - [0, 1] + dst_range: + - [0, 1] + - [0, 1] + dst_dir: "Eject" + # HBM West + - src: "hbm" + dst: "group_router" + src_range: + - [0, 1] + dst_range: + - [0, 0] + - [0, 1] + dst_dir: "West" + # HBM East + - src: "hbm" + dst: "group_router" + src_range: + - [2, 3] + dst_range: + - [1, 1] + - [0, 1] + dst_dir: "East" + 
# Host peripheral South + - src: "host_peri" + dst: "group_router" + dst_idx: [0, 0] + dst_dir: "South" diff --git a/hardware/bootrom/bootdata.cc b/hardware/bootrom/bootdata.cc deleted file mode 100644 index f96b8ba..0000000 --- a/hardware/bootrom/bootdata.cc +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -#include - -namespace sim { - -const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 16, - .hartid_base = 0, - .tcdm_start = 0xbffff800, - .tcdm_size = 0x800, - .tcdm_offset = 0x0, - .global_mem_start = 0x80000000, - .global_mem_end = 0xa0000000, - .tile_count = 4}; - -} // namespace sim diff --git a/hardware/bootrom/bootdata_bootrom.cc b/hardware/bootrom/bootdata_bootrom.cc deleted file mode 100644 index d578d55..0000000 --- a/hardware/bootrom/bootdata_bootrom.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -#include - -// The boot data generated along with the system RTL.
-struct BootData { - uint32_t boot_addr; - uint32_t core_count; - uint32_t hartid_base; - uint32_t tcdm_start; - uint32_t tcdm_size; - uint32_t tcdm_offset; - uint64_t global_mem_start; - uint64_t global_mem_end; - uint32_t tile_count; -}; - -extern "C" const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 16, - .hartid_base = 0, - .tcdm_start = 0xbffff800, - .tcdm_size = 0x800, - .tcdm_offset = 0x0, - .global_mem_start = 0x80000000, - .global_mem_end = 0xa0000000, - .tile_count = 4}; diff --git a/hardware/bootrom/bootrom.bin b/hardware/bootrom/bootrom.bin deleted file mode 100755 index d4a9322..0000000 Binary files a/hardware/bootrom/bootrom.bin and /dev/null differ diff --git a/hardware/bootrom/bootrom.dump b/hardware/bootrom/bootrom.dump deleted file mode 100644 index dad90e3..0000000 --- a/hardware/bootrom/bootrom.dump +++ /dev/null @@ -1,127 +0,0 @@ - -/scratch2/diyou/cachepool/ManyRVData/hardware/bootrom/bootrom.elf: file format elf32-littleriscv - - -Disassembly of section .text: - -00001000 <_start>: - 1000: 00000317 auipc t1,0x0 - 1004: 07832303 lw t1,120(t1) # 1078 <_GLOBAL_OFFSET_TABLE_+0x4> - 1008: 30531073 csrw mtvec,t1 - 100c: f1402573 csrr a0,mhartid - 1010: 00000597 auipc a1,0x0 - 1014: 06c5a583 lw a1,108(a1) # 107c <_GLOBAL_OFFSET_TABLE_+0x8> - 1018: 3047d073 csrwi mie,15 - 101c: 10500073 wfi - 1020: 00c5a383 lw t2,12(a1) - 1024: 0105ae03 lw t3,16(a1) - 1028: 01c383b3 add t2,t2,t3 - 102c: 02038393 addi t2,t2,32 - 1030: 0003a383 lw t2,0(t2) - 1034: 00038067 jr t2 - -00001038 : - 1038: 10500073 wfi - 103c: ffdff06f j 1038 - -Disassembly of section .rodata: - -00001040 : - 1040: 1000 .2byte 0x1000 - 1042: 0000 .2byte 0x0 - 1044: 0010 .2byte 0x10 - 1046: 0000 .2byte 0x0 - 1048: 0000 .2byte 0x0 - 104a: 0000 .2byte 0x0 - 104c: f800 .2byte 0xf800 - 104e: bfff .2byte 0xbfff - 1050: 0800 .2byte 0x800 - ... 
- 105a: 8000 .2byte 0x8000 - 105c: 0000 .2byte 0x0 - 105e: 0000 .2byte 0x0 - 1060: 0000 .2byte 0x0 - 1062: a000 .2byte 0xa000 - 1064: 0000 .2byte 0x0 - 1066: 0000 .2byte 0x0 - 1068: 0004 .2byte 0x4 - 106a: 0000 .2byte 0x0 - 106c: 0000 .2byte 0x0 - ... - -Disassembly of section .boot_section: - -00001070 : - 1070: 1038 .2byte 0x1038 - ... - -Disassembly of section .got: - -00001074 <_GLOBAL_OFFSET_TABLE_>: - 1074: 0000 .2byte 0x0 - 1076: 0000 .2byte 0x0 - 1078: 1038 .2byte 0x1038 - 107a: 0000 .2byte 0x0 - 107c: 1040 .2byte 0x1040 - ... - -Disassembly of section .got.plt: - -00001080 <.got.plt>: - 1080: ffff .2byte 0xffff - 1082: ffff .2byte 0xffff - 1084: 0000 .2byte 0x0 - ... - -Disassembly of section .riscv.attributes: - -00000000 <.riscv.attributes>: - 0: 4341 .2byte 0x4341 - 2: 0000 .2byte 0x0 - 4: 7200 .2byte 0x7200 - 6: 7369 .2byte 0x7369 - 8: 01007663 bgeu zero,a6,14 <_start-0xfec> - c: 0039 .2byte 0x39 - e: 0000 .2byte 0x0 - 10: 1004 .2byte 0x1004 - 12: 7205 .2byte 0x7205 - 14: 3376 .2byte 0x3376 - 16: 6932 .2byte 0x6932 - 18: 7032 .2byte 0x7032 - 1a: 5f31 .2byte 0x5f31 - 1c: 326d .2byte 0x326d - 1e: 3070 .2byte 0x3070 - 20: 615f 7032 5f31 .byte 0x5f, 0x61, 0x32, 0x70, 0x31, 0x5f - 26: 3266 .2byte 0x3266 - 28: 3270 .2byte 0x3270 - 2a: 7a5f 6369 7273 .byte 0x5f, 0x7a, 0x69, 0x63, 0x73, 0x72 - 30: 7032 .2byte 0x7032 - 32: 5f30 .2byte 0x5f30 - 34: 697a .2byte 0x697a - 36: 6566 .2byte 0x6566 - 38: 636e .2byte 0x636e - 3a: 6965 .2byte 0x6965 - 3c: 7032 .2byte 0x7032 - 3e: 0030 .2byte 0x30 - 40: 0108 .2byte 0x108 - 42: 0b0a .2byte 0xb0a - -Disassembly of section .comment: - -00000000 <.comment>: - 0: 3a434347 .4byte 0x3a434347 - 4: 2820 .2byte 0x2820 - 6: 736f7263 bgeu t5,s6,72a <_start-0x8d6> - a: 6f6f7473 csrrci s0,0x6f6,30 - e: 2d6c .2byte 0x2d6c - 10: 474e .2byte 0x474e - 12: 3120 .2byte 0x3120 - 14: 322e .2byte 0x322e - 16: 2e35 .2byte 0x2e35 - 18: 2e30 .2byte 0x2e30 - 1a: 3538 .2byte 0x3538 - 1c: 365f 6331 6334 .byte 0x5f, 0x36, 0x31, 0x63, 0x34, 0x63 - 22: 
20296163 bltu s2,sp,224 <_start-0xddc> - 26: 2e39 .2byte 0x2e39 - 28: 2e35 .2byte 0x2e35 - 2a: 0030 .2byte 0x30 diff --git a/hardware/bootrom/bootrom.elf b/hardware/bootrom/bootrom.elf deleted file mode 100755 index 8c26b6e..0000000 Binary files a/hardware/bootrom/bootrom.elf and /dev/null differ diff --git a/hardware/bootrom/bootrom.sv b/hardware/bootrom/bootrom.sv deleted file mode 100644 index c3b8995..0000000 --- a/hardware/bootrom/bootrom.sv +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Description: Automatically generated bootrom -// -// Generated by util/scripts/generate_bootrom.py - -module bootrom #( - /* Automatically generated. DO NOT CHANGE! */ - parameter int unsigned DataWidth = 128, - parameter int unsigned AddrWidth = 32 -) ( - input logic clk_i, - input logic req_i, - input logic [AddrWidth-1:0] addr_i, - output logic [DataWidth-1:0] rdata_o -); - localparam int RomSize = 8; - localparam int AddrBits = RomSize > 1 ? $clog2(RomSize) : 1; - - const logic [RomSize-1:0][DataWidth-1:0] mem = { - 128'h00001040000010380000000000001038, - 128'h000000000000000400000000a0000000, - 128'h00000000800000000000000000000800, - 128'hbffff800000000000000001000001000, - 128'hffdff06f10500073000380670003a383, - 128'h0203839301c383b30105ae0300c5a383, - 128'h105000733047d07306c5a58300000597, - 128'hf1402573305310730783230300000317 - }; - - logic [AddrBits-1:0] addr_q; - - always_ff @(posedge clk_i) begin - if (req_i) begin - addr_q <= addr_i[AddrBits-1+4:4]; - end - end - - // this prevents spurious Xes from propagating into - // the speculative fetch stage of the core - assign rdata_o = (addr_q < RomSize) ? 
mem[addr_q] : '0; -endmodule diff --git a/hardware/cachepool_peripheral/cachepool_peripheral.sv b/hardware/cachepool_peripheral/cachepool_peripheral.sv index 6326cfa..d539b76 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral.sv @@ -89,7 +89,7 @@ module cachepool_peripheral //////////// L1 DCache //////////// logic [NumPerfCounters-1:0][47:0] perf_counter_d, perf_counter_q; - logic [31:0] cl_clint_d, cl_clint_q; + logic [NrCores-1:0] cl_clint_d, cl_clint_q; logic [9:0] l1d_spm_size_d, l1d_spm_size_q; logic [3:0] l1d_private_d, l1d_private_q; addr_t private_start_addr_d, private_start_addr_q; @@ -172,7 +172,7 @@ module cachepool_peripheral end `FF(private_start_addr_q, private_start_addr_d, 32'hA000_0000, clk_i, rst_ni) - `FF(l1d_private_q, l1d_private_d, '0, clk_i, rst_ni) + `FF(l1d_private_q, l1d_private_d, 0, clk_i, rst_ni) `FF(l1d_lock_q, l1d_lock_d, '0, clk_i, rst_ni) // To show if the current flush/invalidation is complete assign hw2reg.l1d_flush_status.d = (l1d_lock_q != '0); diff --git a/hardware/generated/floo_cachepool_noc_pkg.sv b/hardware/generated/floo_cachepool_noc_pkg.sv new file mode 100644 index 0000000..e475728 --- /dev/null +++ b/hardware/generated/floo_cachepool_noc_pkg.sv @@ -0,0 +1,240 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED! DO NOT EDIT! 
+ +`include "axi/typedef.svh" +`include "floo_noc/typedef.svh" + +package floo_cachepool_noc_pkg; + + import floo_pkg::*; + + ///////////////////// + // Address Map // + ///////////////////// + + typedef enum logic[3:0] { + GroupX0Y0 = 0, + GroupX0Y1 = 1, + GroupX1Y0 = 2, + GroupX1Y1 = 3, + Hbm0 = 4, + Hbm1 = 5, + Hbm2 = 6, + Hbm3 = 7, + HostPeri = 8, + NumEndpoints = 9} ep_id_e; + + + + typedef enum logic[2:0] { + Hbm0SamIdx = 0, + Hbm1SamIdx = 1, + Hbm2SamIdx = 2, + Hbm3SamIdx = 3, + HostPeriSamIdx = 5} sam_idx_e; + + + + typedef logic[0:0] rob_idx_t; +typedef logic[0:0] port_id_t; +typedef logic[3:0] id_t; +typedef logic[8:0] route_t; + + + typedef struct packed { + id_t idx; + id_t start_addr; + id_t end_addr; + } route_map_rule_t; + + localparam int unsigned SamNumRules = 6; + +typedef struct packed { + id_t idx; + logic [31:0] start_addr; + logic [31:0] end_addr; +} sam_rule_t; + +localparam sam_rule_t[SamNumRules-1:0] Sam = '{ +'{ idx: 8, + start_addr: 32'h00000000, + end_addr: 32'h7fffffff},// HostPeri +'{ idx: 8, + start_addr: 32'ha0000000, + end_addr: 32'hc000ffff},// HostPeri +'{ idx: 7, + start_addr: 32'h80300000, + end_addr: 32'h80400000},// Hbm3 +'{ idx: 6, + start_addr: 32'h80200000, + end_addr: 32'h80300000},// Hbm2 +'{ idx: 5, + start_addr: 32'h80100000, + end_addr: 32'h80200000},// Hbm1 +'{ idx: 4, + start_addr: 32'h80000000, + end_addr: 32'h80100000} // Hbm0 + +}; + + + localparam route_t[NumEndpoints-1:0][NumEndpoints-1:0] RoutingTables = '{ +'{ +9'b000000000,// -> host_peri_ni +9'b001001000,// -> hbm_ni_3 +9'b000001001,// -> hbm_ni_2 +9'b000011000,// -> hbm_ni_1 +9'b000000011,// -> hbm_ni_0 +9'b100001000,// -> group_ni_1_1 +9'b000100001,// -> group_ni_1_0 +9'b000100000,// -> group_ni_0_1 +9'b000000100 // -> group_ni_0_0 +}, +'{ +9'b010010011,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000000100,// -> group_ni_1_1 +9'b000100010,// -> group_ni_1_0 
+9'b000100011,// -> group_ni_0_1 +9'b100010011 // -> group_ni_0_0 +}, +'{ +9'b000010011,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000100000,// -> group_ni_1_1 +9'b000000100,// -> group_ni_1_0 +9'b100000011,// -> group_ni_0_1 +9'b000100011 // -> group_ni_0_0 +}, +'{ +9'b000010010,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000100001,// -> group_ni_1_1 +9'b100001010,// -> group_ni_1_0 +9'b000000100,// -> group_ni_0_1 +9'b000100010 // -> group_ni_0_0 +}, +'{ +9'b000000010,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b100001000,// -> group_ni_1_1 +9'b000100001,// -> group_ni_1_0 +9'b000100000,// -> group_ni_0_1 +9'b000000100 // -> group_ni_0_0 +}, +'{ +9'b010010011,// -> host_peri_ni +9'b000000001,// -> hbm_ni_3 +9'b000001010,// -> hbm_ni_2 +9'b000011011,// -> hbm_ni_1 +9'b011010011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000010011,// -> host_peri_ni +9'b000001000,// -> hbm_ni_3 +9'b000000001,// -> hbm_ni_2 +9'b011000011,// -> hbm_ni_1 +9'b000011011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000010010,// -> host_peri_ni +9'b000001001,// -> hbm_ni_3 +9'b001001010,// -> hbm_ni_2 +9'b000000011,// -> hbm_ni_1 +9'b000011010,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000000010,// -> host_peri_ni +9'b001001000,// -> hbm_ni_3 +9'b000001001,// -> hbm_ni_2 +9'b000011000,// -> hbm_ni_1 +9'b000000011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 
+9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}} +; + + + localparam route_cfg_t RouteCfg = '{ RouteAlgo: SourceRouting, + UseIdTable: 1'b1, + XYAddrOffsetX: 0, + XYAddrOffsetY: 0, + IdAddrOffset: 0, + NumSamRules: 6, + NumRoutes: 9, + CollectiveCfg: '{ OpCfg: '{ EnNarrowMulticast: 1'b0, + EnWideMulticast: 1'b0, + EnLsbAnd: 1'b0, + EnFpAdd: 1'b0, + EnFpMul: 1'b0, + EnFpMin: 1'b0, + EnFpMax: 1'b0, + EnIntAdd: 1'b0, + EnIntMul: 1'b0, + EnIntMinS: 1'b0, + EnIntMinU: 1'b0, + EnIntMaxS: 1'b0, + EnIntMaxU: 1'b0}, + NarrRedCfg: RedDefaultCfg, + WideRedCfg: RedDefaultCfg}}; + + + + typedef logic[31:0] axi_wide_in_addr_t; +typedef logic[255:0] axi_wide_in_data_t; +typedef logic[31:0] axi_wide_in_strb_t; +typedef logic[1:0] axi_wide_in_id_t; +typedef logic[0:0] axi_wide_in_user_t; +`AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) + + + typedef logic[31:0] axi_wide_out_addr_t; +typedef logic[255:0] axi_wide_out_data_t; +typedef logic[31:0] axi_wide_out_strb_t; +typedef logic[1:0] axi_wide_out_id_t; +typedef logic[0:0] axi_wide_out_user_t; +`AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, axi_wide_out_user_t) + + + + `FLOO_TYPEDEF_HDR_T(hdr_t, route_t, id_t, axi_ch_e, rob_idx_t) + localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, + DataWidth: 256, + InIdWidth: 2, + OutIdWidth: 2, + UserWidth: 1}; +`FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_wide_in, AxiCfg, hdr_t) + +`FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) + + +endpackage diff --git a/hardware/src/axi_hier_interco.sv b/hardware/src/axi_hier_interco.sv new file mode 100644 index 0000000..b4ab001 --- /dev/null +++ b/hardware/src/axi_hier_interco.sv @@ -0,0 +1,322 @@ +// Copyright 2021 ETH Zurich and University of Bologna. 
+// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Author: Samuel Riedel + +// Implement a hierarchical AXI interconnect. Below shows one level of the interconnect. This module +// recursively instantiates itself and creates a tree of interconnects, each node with `Radix` slave +// ports. +// +// AXI Mux Read-only ID Width +// Cache Converter +// |‾╲ +// +-------->| ╲ +// | + +-------+ +-------+ +// +-------->| M | | | | | +// | U |---->| $ |---->| > |----> +// | X | | | | | +// | + +-------+ +-------+ +// +-------->| ╱ +// |_╱ +// Internal Cache +// Slave type type type Master type + +module axi_hier_interco + import cachepool_pkg::ro_cache_ctrl_t; +#( + parameter int unsigned NumSlvPorts = 0, + parameter int unsigned NumMstPorts = 0, + parameter int unsigned Radix = 2, + parameter int unsigned EnableCache = 0, + parameter int unsigned CacheLineWidth = 0, + parameter int unsigned CacheSizeByte = 0, + parameter int unsigned CacheSets = 0, + parameter int unsigned AddrWidth = 0, + parameter int unsigned DataWidth = 0, + parameter int unsigned SlvIdWidth = 0, + parameter int unsigned MstIdWidth = 0, + parameter int unsigned UserWidth = 0, + parameter type slv_req_t = logic, + parameter type slv_resp_t = logic, + parameter type mst_req_t = logic, + parameter type mst_resp_t = logic +) ( + input logic clk_i, + input logic rst_ni, + input logic test_i, + input ro_cache_ctrl_t ro_cache_ctrl_i, + input slv_req_t [NumSlvPorts-1:0] slv_req_i, + output slv_resp_t [NumSlvPorts-1:0] slv_resp_o, + output mst_req_t [NumMstPorts-1:0] mst_req_o, + input mst_resp_t [NumMstPorts-1:0] mst_resp_i +); + + //////////////// + // Typedefs // + //////////////// + + localparam int unsigned IntIdWidth = SlvIdWidth + $clog2(NumSlvPorts); + localparam int unsigned CacheIdWidth = EnableCache[0] ? 
IntIdWidth + 1: IntIdWidth; + localparam int unsigned NrAddrRules = cachepool_pkg::ROCacheNumAddrRules; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [DataWidth-1:0] data_t; + typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [SlvIdWidth-1:0] slv_id_t; + typedef logic [MstIdWidth-1:0] mst_id_t; + typedef logic [IntIdWidth-1:0] int_id_t; + typedef logic [CacheIdWidth-1:0] cache_id_t; + typedef logic [UserWidth-1:0] user_t; + + `include "axi/typedef.svh" + // Common AXI types + `AXI_TYPEDEF_W_CHAN_T(w_t, data_t, strb_t, user_t); + // Slave AXI types + `AXI_TYPEDEF_AW_CHAN_T(slv_aw_t, addr_t, slv_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(slv_b_t, slv_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(slv_ar_t, addr_t, slv_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(slv_r_t, data_t, slv_id_t, user_t); + // Intermediate AXI types + `AXI_TYPEDEF_AW_CHAN_T(int_aw_t, addr_t, int_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(int_b_t, int_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(int_ar_t, addr_t, int_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(int_r_t, data_t, int_id_t, user_t); + `AXI_TYPEDEF_REQ_T(int_req_t, int_aw_t, w_t, int_ar_t); + `AXI_TYPEDEF_RESP_T(int_resp_t, int_b_t, int_r_t ); + // Cache AXI types + `AXI_TYPEDEF_AW_CHAN_T(cache_aw_t, addr_t, cache_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(cache_b_t, cache_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(cache_ar_t, addr_t, cache_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(cache_r_t, data_t, cache_id_t, user_t); + `AXI_TYPEDEF_REQ_T(cache_req_t, cache_aw_t, w_t, cache_ar_t); + `AXI_TYPEDEF_RESP_T(cache_resp_t, cache_b_t, cache_r_t ); + + /////////////// + // Interco // + /////////////// + + // Recursive module to implement multiple hierarchy levels at once + + if (NumMstPorts > NumSlvPorts) begin : gen_error + $error("[axi_hier_interco] `NumMstPorts` must not be bigger than `NumSlvPorts`."); + end else if (NumMstPorts == NumSlvPorts) begin : gen_top_level + // Top-level, connect the ports to the master ports + for (genvar i = 0; i < NumMstPorts;
i++) begin : gen_bypasses + assign mst_req_o[i] = slv_req_i[i]; + assign slv_resp_o[i] = mst_resp_i[i]; + end + end else if (Radix <= 1) begin : gen_error + $error("[axi_hier_interco] `Radix` must be bigger than 1."); + end else if (NumSlvPorts > Radix) begin : gen_axi_level_recursive + // More than one level missing. --> Recursively call this module + // This level will contain `NumMuxes` interconnects + localparam int unsigned NumMuxes = NumSlvPorts / Radix; + if (NumMuxes * Radix != NumSlvPorts) begin : gen_error + $error("[axi_hier_interco] `NumSlvPorts` mod `Radix` must be 0."); + end else begin : gen_level + slv_req_t [NumMuxes-1:0] int_req; + slv_resp_t [NumMuxes-1:0] int_resp; + + for (genvar i = 0; i < NumMuxes; i++) begin : gen_lower_level + axi_hier_interco #( + .NumSlvPorts (Radix ), + .NumMstPorts (1 ), + .Radix (Radix ), + .EnableCache (EnableCache ), + .CacheLineWidth (CacheLineWidth), + .CacheSizeByte (CacheSizeByte ), + .CacheSets (CacheSets ), + .AddrWidth (AddrWidth ), + .DataWidth (DataWidth ), + .SlvIdWidth (SlvIdWidth ), + .MstIdWidth (SlvIdWidth ), + .UserWidth (UserWidth ), + .slv_req_t (slv_req_t ), + .slv_resp_t (slv_resp_t ), + .mst_req_t (slv_req_t ), + .mst_resp_t (slv_resp_t ) + ) i_axi_interco ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .ro_cache_ctrl_i (ro_cache_ctrl_i ), + .slv_req_i (slv_req_i[i*Radix +: Radix] ), + .slv_resp_o (slv_resp_o[i*Radix +: Radix]), + .mst_req_o (int_req[i] ), + .mst_resp_i (int_resp[i] ) + ); + end + + axi_hier_interco #( + .NumSlvPorts (NumMuxes ), + .NumMstPorts (NumMstPorts ), + .Radix (Radix ), + .EnableCache (EnableCache>>1), + .CacheLineWidth (CacheLineWidth), + .CacheSizeByte (CacheSizeByte ), + .CacheSets (CacheSets ), + .AddrWidth (AddrWidth ), + .DataWidth (DataWidth ), + .SlvIdWidth (SlvIdWidth ), + .MstIdWidth (MstIdWidth ), + .UserWidth (UserWidth ), + .slv_req_t (slv_req_t ), + .slv_resp_t (slv_resp_t ), + .mst_req_t (mst_req_t ), + .mst_resp_t (mst_resp_t ) + ) 
i_axi_interco ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .ro_cache_ctrl_i (ro_cache_ctrl_i), + .slv_req_i (int_req ), + .slv_resp_o (int_resp ), + .mst_req_o (mst_req_o ), + .mst_resp_i (mst_resp_i ) + ); + end + end else if (NumSlvPorts <= Radix && NumMstPorts == 1) begin : gen_bottom_level + + // Intermediate AXI channel + int_req_t int_req; + int_resp_t int_resp; + cache_req_t cache_req; + cache_resp_t cache_resp; + + axi_mux #( + // AXI parameter and channel types + .SlvAxiIDWidth (SlvIdWidth ), // AXI ID width, slave ports + .slv_aw_chan_t (slv_aw_t ), // AW Channel Type, slave ports + .mst_aw_chan_t (int_aw_t ), // AW Channel Type, master port + .w_chan_t (w_t ), // W Channel Type, all ports + .slv_b_chan_t (slv_b_t ), // B Channel Type, slave ports + .mst_b_chan_t (int_b_t ), // B Channel Type, master port + .slv_ar_chan_t (slv_ar_t ), // AR Channel Type, slave ports + .mst_ar_chan_t (int_ar_t ), // AR Channel Type, master port + .slv_r_chan_t (slv_r_t ), // R Channel Type, slave ports + .mst_r_chan_t (int_r_t ), // R Channel Type, master port + .slv_req_t (slv_req_t ), // Slave port request type + .slv_resp_t (slv_resp_t ), // Slave port response type + .mst_req_t (int_req_t ), // Master ports request type + .mst_resp_t (int_resp_t ), // Master ports response type + .NoSlvPorts (NumSlvPorts), // Number of slave ports + // Maximum number of outstanding transactions per write + .MaxWTrans (8 ), + // If enabled, this multiplexer is purely combinatorial + .FallThrough (1'b0 ), + // add spill register on write master ports, adds a cycle latency on write channels + .SpillAw (1'b1 ), + .SpillW (1'b1 ), + .SpillB (1'b1 ), + // add spill register on read master ports, adds a cycle latency on read channels + .SpillAr (1'b1 ), + .SpillR (1'b1 ) + ) i_axi_mux ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .slv_reqs_i (slv_req_i ), + .slv_resps_o (slv_resp_o), + .mst_req_o (int_req ), + .mst_resp_i (int_resp ) + ); + + if 
(EnableCache[0]) begin: gen_ro_cache + localparam int unsigned LineCount = CacheSizeByte/(CacheSets*CacheLineWidth/8); + snitch_read_only_cache #( + .LineWidth (CacheLineWidth), + .LineCount (LineCount ), + .WayCount (CacheSets ), + .AxiAddrWidth (AddrWidth ), + .AxiDataWidth (DataWidth ), + .AxiIdWidth (IntIdWidth ), + .AxiUserWidth (UserWidth ), + .MaxTrans (32'd16 ), + .NrAddrRules (NrAddrRules ), + .SerialLookup (0 ), + .slv_req_t (int_req_t ), + .slv_rsp_t (int_resp_t ), + .mst_req_t (cache_req_t ), + .mst_rsp_t (cache_resp_t ) + ) i_snitch_read_only_cache ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .enable_i (ro_cache_ctrl_i.enable ), + .flush_valid_i (ro_cache_ctrl_i.flush_valid), + .flush_ready_o (/* unused */ ), + .icache_events_o (/* unused */ ), + .start_addr_i (ro_cache_ctrl_i.start_addr ), + .end_addr_i (ro_cache_ctrl_i.end_addr ), + .axi_slv_req_i (int_req ), + .axi_slv_rsp_o (int_resp ), + .axi_mst_req_o (cache_req ), + .axi_mst_rsp_i (cache_resp ), + .sram_cfg_data_i ('0 ), + .sram_cfg_tag_i ('0 ), + .sram_cfg_out_data_o (/* unused */ ), + .sram_cfg_out_tag_o (/* unused */ ) + ); + end else begin: gen_no_ro_cache + assign cache_req = int_req; + assign int_resp = cache_resp; + end + + axi_id_remap #( + .AxiSlvPortIdWidth (CacheIdWidth ), + .AxiSlvPortMaxUniqIds (2**MstIdWidth), + .AxiMaxTxnsPerId (8 ), + .AxiMstPortIdWidth (MstIdWidth ), + .slv_req_t (cache_req_t ), + .slv_resp_t (cache_resp_t ), + .mst_req_t (mst_req_t ), + .mst_resp_t (mst_resp_t ) + ) i_axi_id_remap ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (cache_req ), + .slv_resp_o (cache_resp), + .mst_req_o (mst_req_o ), + .mst_resp_i (mst_resp_i) + ); + + // Check all the AXI widths + if ($bits(slv_req_i[0].aw.addr) != AddrWidth) + $error("[axi_hier_interco] `slv_req_i.aw.addr` does not match AddrWidth."); + if ($bits(slv_req_i[0].w.data) != DataWidth) + $error("[axi_hier_interco] `slv_req_i.w.data` does not match DataWidth."); + if ($bits(slv_req_i[0].aw.id) != SlvIdWidth) + 
$error("[axi_hier_interco] `slv_req_i.aw.id` does not match SlvIdWidth."); + if ($bits(slv_req_i[0].aw.user) != UserWidth) + $error("[axi_hier_interco] `slv_req_i.aw.user` does not match UserWidth."); + + if ($bits(mst_req_o[0].aw.addr) != AddrWidth) + $error("[axi_hier_interco] `mst_req_o.aw.addr` does not match AddrWidth."); + if ($bits(mst_req_o[0].w.data) != DataWidth) + $error("[axi_hier_interco] `mst_req_o.w.data` does not match DataWidth."); + if ($bits(mst_req_o[0].aw.id) != MstIdWidth) + $error("[axi_hier_interco] `mst_req_o.aw.id` does not match MstIdWidth."); + if ($bits(mst_req_o[0].aw.user) != UserWidth) + $error("[axi_hier_interco] `mst_req_o.aw.user` does not match UserWidth."); + + if ($bits(int_req.aw.addr) != AddrWidth) + $error("[axi_hier_interco] `int_req.aw.addr` does not match AddrWidth."); + if ($bits(int_req.w.data) != DataWidth) + $error("[axi_hier_interco] `int_req.w.data` does not match DataWidth."); + if ($bits(int_req.aw.id) != IntIdWidth) + $error("[axi_hier_interco] `int_req.aw.id` does not match IntIdWidth."); + if ($bits(int_req.aw.user) != UserWidth) + $error("[axi_hier_interco] `int_req.aw.user` does not match UserWidth."); + + if ($bits(cache_req.aw.addr) != AddrWidth) + $error("[axi_hier_interco] `cache_req.aw.addr` does not match AddrWidth."); + if ($bits(cache_req.w.data) != DataWidth) + $error("[axi_hier_interco] `cache_req.w.data` does not match DataWidth."); + if ($bits(cache_req.aw.id) != CacheIdWidth) + $error("[axi_hier_interco] `cache_req.aw.id` does not match CacheIdWidth."); + if ($bits(cache_req.aw.user) != UserWidth) + $error("[axi_hier_interco] `cache_req.aw.user` does not match UserWidth."); + end else begin: gen_error + $error("[axi_hier_interco] Cannot build a tree with those parameters."); + end +endmodule diff --git a/hardware/src/cachepool_cc.sv b/hardware/src/cachepool_cc.sv index 86c8d7e..1950cd6 100644 --- a/hardware/src/cachepool_cc.sv +++ b/hardware/src/cachepool_cc.sv @@ -23,12 +23,6 @@ module 
cachepool_cc parameter int unsigned DataWidth = 0, /// User width of the buses. parameter int unsigned UserWidth = 0, - /// Data width of the AXI DMA buses. - parameter int unsigned DMADataWidth = 0, - /// Id width of the AXI DMA bus. - parameter int unsigned DMAIdWidth = 0, - parameter int unsigned DMAAxiReqFifoDepth = 0, - parameter int unsigned DMAReqFifoDepth = 0, parameter int unsigned SpmStackDepth = 512, /// Data port request type. @@ -75,7 +69,6 @@ module cachepool_cc parameter bit XF16ALT = 0, parameter bit XF8ALT = 0, /// Enable Snitch DMA - parameter bit Xdma = 0, parameter int unsigned NumIntOutstandingLoads = 0, parameter int unsigned NumIntOutstandingMem = 0, parameter int unsigned NumSpatzOutstandingLoads = 0, @@ -176,7 +169,7 @@ module cachepool_cc .VMSupport (1'b0 ), .RVE (RVE ), .FP_EN (FPEn ), - .Xdma (Xdma ), + .Xdma (1'b0 ), .RVF (RVF ), .RVD (RVD ), .RVV (RVV ), diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index df687a3..c9102ef 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -4,21 +4,11 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" -`include "common_cells/assertions.svh" -`include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" `include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" -`include "reqrsp_interface/typedef.svh" -`include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" -/// A single-tile cluster implementation for CachePool +/// CachePool cluster: instantiates NumGroups groups connected via FlooNoC mesh, +/// with shared L2 memory and peripheral fabric. module cachepool_cluster import cachepool_pkg::*; import spatz_pkg::*; @@ -47,10 +37,6 @@ module cachepool_cluster parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. 
parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. - parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -65,8 +51,6 @@ module cachepool_cluster /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = 0, @@ -115,20 +99,20 @@ module cachepool_cluster input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic debug_req_i, /// End of Computing indicator to notify the host/tb output logic [3:0] eoc_o, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. 
@@ -157,39 +141,16 @@ module cachepool_cluster // Imports // --------- import snitch_pkg::*; - import snitch_icache_pkg::icache_events_t; // --------- // Constants // --------- - /// Minimum width to hold the core number. - localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); - - // Enlarge the address width for Spatz due to cache - localparam int unsigned TCDMAddrWidth = 32; - - // Core Request, SoC Request - localparam int unsigned NrNarrowMasters = 2; - localparam int unsigned WideIdWidthOut = AxiIdWidthOut; - localparam int unsigned WideIdWidthIn = WideIdWidthOut - $clog2(NumClusterMst); - - // Cache XBar configuration struct - localparam axi_pkg::xbar_cfg_t CacheXbarCfg = '{ - NoSlvPorts : NumClusterMst*NumTiles, - NoMstPorts : ClusterWideOutAxiPorts, - MaxMstTrans : MaxMstTrans, - MaxSlvTrans : MaxSlvTrans, - FallThrough : 1'b0, - LatencyMode : XbarLatency, - AxiIdWidthSlvPorts: WideIdWidthIn, - AxiIdUsedSlvPorts : WideIdWidthIn, - UniqueIds : 1'b0, - AxiAddrWidth : AxiAddrWidth, - AxiDataWidth : AxiDataWidth, - NoAddrRules : ClusterWideOutAxiPorts - 1, - default : '0 - }; + localparam int unsigned WideIdWidthIn = WideIdWidthOut - ClusterRouteIdWidth - GroupMuxIdBits; + + // Pre-mux AXI ID width: per-group reqrsp_to_axi output. + // The multi-group axi_mux adds GroupMuxIdBits on top to reach WideIdWidthOut. 
+ localparam int unsigned WideIdWidthPreMux = WideIdWidthOut - GroupMuxIdBits; // -------- // Typedefs @@ -201,37 +162,39 @@ module cachepool_cluster typedef logic [WideIdWidthOut-1:0] id_cache_slv_t; typedef logic [AxiUserWidth-1:0] user_cache_t; - `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) - `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) - - `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t) + // reqrsp_to_axi output type: full GroupAxiIdOutWidth-bit IDs (decoupled from WideIdWidthPreMux + // which now equals WideRefillIdWidth after per-group ID remapping). + typedef logic [GroupAxiIdOutWidth-1:0] id_cache_premux_t; + // Remapper output / mux slave input type: bounded WideRefillIdWidth-bit IDs. + typedef logic [WideIdWidthPreMux-1:0] id_cache_remap_t; - typedef struct packed { - int unsigned idx; - addr_t start_addr; - addr_t end_addr; - } xbar_rule_t; - - `SNITCH_VM_TYPEDEF(AxiAddrWidth) + `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) + // Post-mux AXI types (same as before — used for axi_cut and output). + `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) + // reqrsp_to_axi output AXI types (full GroupAxiIdOutWidth-bit IDs). + `AXI_TYPEDEF_ALL(axi_premux_cache, addr_t, id_cache_premux_t, data_cache_t, strb_cache_t, user_cache_t) + // Remapped AXI types: WideRefillIdWidth-bit IDs, fed into the inter-group mux / future NoC. + `AXI_TYPEDEF_ALL(axi_remap_cache, addr_t, id_cache_remap_t, data_cache_t, strb_cache_t, user_cache_t) // ---------------- // Wire Definitions // ---------------- // 1. 
AXI - axi_mst_cache_req_t [NumTiles-1:0][TileNarrowAxiPorts-1:0] axi_tile_req; - axi_mst_cache_resp_t [NumTiles-1:0][TileNarrowAxiPorts-1:0] axi_tile_rsp; - axi_slv_cache_req_t [ClusterWideOutAxiPorts-1 :0] wide_axi_slv_req; - axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1 :0] wide_axi_slv_rsp; - axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; - axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; - - // 2. BootROM - reg_cache_req_t [NumTiles-1:0] bootrom_reg_req; - reg_cache_rsp_t [NumTiles-1:0] bootrom_reg_rsp; + // Post-mux wide AXI (one per L2 channel, merged across groups). + axi_slv_cache_req_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_req; + axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_rsp; + // Per-group pre-mux wide AXI (per group, per L2 channel): full GroupAxiIdOutWidth-bit IDs. + axi_premux_cache_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_premux_req; + axi_premux_cache_resp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_premux_rsp; + // Per-group remapped wide AXI: WideRefillIdWidth-bit IDs, fed into the inter-group mux. + axi_remap_cache_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_remap_req; + axi_remap_cache_resp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_remap_rsp; + // Narrow AXI per tile (UART + Periph). + axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; + axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; // 3. Peripherals axi_addr_t private_start_addr; - icache_events_t [NrCores-1:0] icache_events; logic icache_prefetch_enable; logic [NrCores-1:0] cl_interrupt; logic [$clog2(L1AddrWidth)-1:0] dynamic_offset; @@ -241,503 +204,316 @@ module cachepool_cluster logic [NumTiles-1:0] l1d_insn_ready; logic [NumTiles-1:0] l1d_busy; + // Per-group error signals. 
+ logic [NumGroups-1:0] group_error; + + // Inter-group NoC mesh signals (indexed by group, then direction, then port) + noc_group_req_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out_ready; + noc_group_req_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in_ready; + noc_group_rsp_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out_ready; + noc_group_rsp_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in_ready; + // --------------- - // CachePool Tile + // CachePool Group // --------------- - cache_trans_req_t [NumL1CacheCtrl-1 :0] cache_refill_req; - cache_trans_rsp_t [NumL1CacheCtrl-1 :0] cache_refill_rsp; - - cache_trans_req_t [NumTiles-1 :0] cache_core_req; - cache_trans_rsp_t [NumTiles-1 :0] cache_core_rsp; - - cache_trans_req_chan_t [NumTiles*NumClusterMst-1 :0] tile_req_chan; - cache_trans_rsp_chan_t [NumTiles*NumClusterMst-1 :0] tile_rsp_chan; - logic [NumTiles*NumClusterMst-1 :0] tile_req_valid, tile_req_ready, tile_rsp_valid, tile_rsp_ready; - - l2_req_t [ClusterWideOutAxiPorts-1 :0] l2_req; - l2_rsp_t [ClusterWideOutAxiPorts-1 :0] l2_rsp; - - cache_trans_req_chan_t [ClusterWideOutAxiPorts-1 :0] l2_req_chan; - cache_trans_rsp_chan_t [ClusterWideOutAxiPorts-1 :0] l2_rsp_chan; - logic 
[ClusterWideOutAxiPorts-1 :0] l2_req_valid, l2_req_ready , l2_rsp_valid, l2_rsp_ready; - - typedef logic [$clog2(NumClusterMst*NumTiles)-1:0] l2_sel_t; - // one more bit for out-of-range alert - typedef logic [$clog2(ClusterWideOutAxiPorts) :0] tile_sel_err_t; - typedef logic [$clog2(ClusterWideOutAxiPorts)-1:0] tile_sel_t; - - // Which l2 we want to select for each req - tile_sel_err_t [NumTiles*NumClusterMst-1 :0] tile_sel_err; - tile_sel_t [NumTiles*NumClusterMst-1 :0] tile_sel; - // Which tile we selected for each req - l2_sel_t [ClusterWideOutAxiPorts-1 :0] tile_selected; - // which tile we want to select for each rsp - l2_sel_t [ClusterWideOutAxiPorts-1 :0] l2_sel; - // What is the priority for response wiring? - // Here we want to make sure the responses from one burst - // continues until done - // If the rsp is a burst with blen != 0, then we will keep - // the rr same, until got a burst rsp with blen == 0 - tile_sel_t [NumTiles*NumClusterMst-1 :0] l2_rsp_rr; - - logic [NumTiles*NumClusterMst-1 :0] rr_lock_d, rr_lock_q; - tile_sel_t [NumTiles*NumClusterMst-1 :0] l2_prio_d, l2_prio_q; - - - l2_sel_t [ClusterWideOutAxiPorts-1:0] port_id; - - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - assign port_id[i] = l2_rsp[i].p.user.tile_id * NumClusterMst + l2_rsp[i].p.user.bank_id; - end - - - if (Burst_Enable) begin : gen_burst_ext_sel - `FF(rr_lock_q, rr_lock_d, 1'b0) - `FF(l2_prio_q, l2_prio_d, 1'b0) - - for (genvar port = 0; port < NumTiles*NumClusterMst; port ++) begin : gen_rsp_rr - tile_sel_t l2_rr; - logic [ClusterWideOutAxiPorts-1:0] arb_valid; - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - // Used to check the round-robin selection - assign arb_valid[i] = (port_id[i] == port) & l2_rsp_valid[i]; - end - - always_comb begin - l2_prio_d[port] = l2_prio_q[port]; - rr_lock_d[port] = rr_lock_q[port]; - - // Determine the priority we give - // round-robin or locked to previous value? 
- if (|arb_valid) begin - if (rr_lock_q[port]) begin - // rr is locked because of burst - l2_prio_d[port] = l2_prio_q[port]; - end else begin - l2_prio_d[port] = l2_rr; - end - end - // assigned to xbar rr_i - l2_rsp_rr[port] = l2_prio_d[port]; - - // Lock judgement - // Should it work on the l2_rsp instead of tile_rsp? - if (tile_rsp_chan[port].user.burst.is_burst & |arb_valid) begin - // We got a burst response - if (tile_rsp_chan[port].user.burst.burst_len == 0) begin - // this is the last transaction within a burt, remove lock - rr_lock_d[port] = 1'b0; - end else begin - // the burst response is not finished yet, lock the rr - rr_lock_d[port] = 1'b1; - end - end - end - - // We use the rr_arb_tree to get the round-robin selection - // No data is needed here, only need the handshaking - rr_arb_tree #( - .NumIn ( ClusterWideOutAxiPorts ), - .DataType ( logic ), - .ExtPrio ( 1'b0 ), - .AxiVldRdy ( 1'b1 ), - .LockIn ( 1'b1 ) - ) i_rr_arb_tree ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .flush_i ( '0 ), - .rr_i ( '0 ), - .req_i ( arb_valid ), - .gnt_o ( /*not used*/ ), - .data_i ( '0 ), - .req_o ( /*not used*/ ), - .gnt_i ( tile_rsp_ready[port] ), - .data_o ( /*not used*/ ), - .idx_o ( l2_rr ) + // Per-group L2 reqrsp ports (one per L2 channel per group). 
+ l2_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] l2_req; + l2_rsp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] l2_rsp; + + assign error_o = |group_error; + + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_group_y + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_group_x + // Flat group index: g = gy * NumGroupsX + gx + localparam int unsigned g = gy * NumGroupsX + gx; + cachepool_group_noc_wrapper #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoreGroup ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks / NumGroups ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_group ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( group_error[g] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id_i + 10'(g * 
NumCoreGroup) ), + .tile_base_id_i ( TileIDWidth'(g * NumTilesPerGroup) ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .private_start_addr_i ( private_start_addr ), + .axi_narrow_req_o ( axi_out_req [g*NumTilesPerGroup +: NumTilesPerGroup] ), + .axi_narrow_rsp_i ( axi_out_resp[g*NumTilesPerGroup +: NumTilesPerGroup] ), + // DRAM refill reqrsp (post-xbar, one per L2 channel) + .l2_req_o ( l2_req[g] ), + .l2_rsp_i ( l2_rsp[g] ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable ), + .cl_interrupt_i ( cl_interrupt [g*NumCoreGroup +: NumCoreGroup] ), + .dynamic_offset_i ( dynamic_offset ), + .l1d_private_i ( l1d_private ), + .l1d_insn_i ( l1d_insn ), + .l1d_insn_valid_i ( l1d_insn_valid ), + .l1d_insn_ready_o ( l1d_insn_ready[g*NumTilesPerGroup +: NumTilesPerGroup]), + .l1d_busy_i ( l1d_busy [g*NumTilesPerGroup +: NumTilesPerGroup]), + .group_xy_id_i ( group_xy_id_t'{x: gx, + y: gy, + port_id: 1'b0} ), + .noc_req_o ( noc_req_out [g] ), + .noc_req_valid_o ( noc_req_out_valid[g] ), + .noc_req_ready_i ( noc_req_out_ready[g] ), + .noc_req_i ( noc_req_in [g] ), + .noc_req_valid_i ( noc_req_in_valid [g] ), + .noc_req_ready_o ( noc_req_in_ready [g] ), + .noc_rsp_o ( noc_rsp_out [g] ), + .noc_rsp_valid_o ( noc_rsp_out_valid[g] ), + .noc_rsp_ready_i ( noc_rsp_out_ready[g] ), + .noc_rsp_i ( noc_rsp_in [g] ), + .noc_rsp_valid_i ( noc_rsp_in_valid [g] ), + .noc_rsp_ready_o ( noc_rsp_in_ready [g] ) ); end - end else begin - assign l2_prio_d = '0; - assign l2_prio_q = '0; - assign rr_lock_d = '0; - assign rr_lock_q = '0; - assign l2_rsp_rr = '0; end - if (NumTiles > 1) begin : gen_group - cachepool_group #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NrCores ), - .TCDMDepth ( TCDMDepth ), 
- .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_group ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error_o ), - .debug_req_i ( debug_req_i ), - .meip_i ( meip_i ), - .mtip_i ( mtip_i ), - .msip_i ( msip_i ), - .hart_base_id_i ( hart_base_id_i ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .private_start_addr_i ( private_start_addr ), - .axi_narrow_req_o ( axi_out_req ), - .axi_narrow_rsp_i ( axi_out_resp ), - .axi_wide_req_o ( axi_tile_req ), - .axi_wide_rsp_i ( axi_tile_rsp ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req ), - .cache_refill_rsp_i ( cache_refill_rsp ), - // Peripherals - .icache_events_o ( icache_events ), - .icache_prefetch_enable_i ( icache_prefetch_enable ), - .cl_interrupt_i ( cl_interrupt ), - .dynamic_offset_i ( dynamic_offset ), - .l1d_private_i ( l1d_private ), - .l1d_insn_i ( l1d_insn ), - .l1d_insn_valid_i ( l1d_insn_valid ), - .l1d_insn_ready_o ( 
l1d_insn_ready ), - .l1d_busy_i ( l1d_busy ) - ); - // TODO: 2 axi ports converted lost correct assignments - // 1. tile id? - // 2. mux then convert? - for (genvar t = 0; t < NumTiles; t ++) begin : gen_axi_converter - axi_to_reqrsp #( - .axi_req_t ( axi_mst_cache_req_t ), - .axi_rsp_t ( axi_mst_cache_resp_t ), - .AddrWidth ( AxiAddrWidth ), - .DataWidth ( AxiDataWidth ), - .UserWidth ( $bits(refill_user_t) ), - .IdWidth ( AxiIdWidthIn ), - .BufDepth ( NumSpatzOutstandingLoads ), - .reqrsp_req_t ( cache_trans_req_t ), - .reqrsp_rsp_t ( cache_trans_rsp_t ) - ) i_axi2reqrsp ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .busy_o ( ), - .axi_req_i ( axi_tile_req [t][TileMem] ), - .axi_rsp_o ( axi_tile_rsp [t][TileMem] ), - .reqrsp_req_o ( cache_core_req[t] ), - .reqrsp_rsp_i ( cache_core_rsp[t] ) - ); + // ---------------------------- + // Inter-group NoC mesh wiring + // ---------------------------- + + // East-West (horizontal) interior connections + for (genvar gx = 0; gx < NumGroupsX-1; gx++) begin : gen_ew_conn + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_ew_conn_y + // East output of (gx,gy) → West input of (gx+1,gy) + assign noc_req_in [gx+1 + gy*NumGroupsX][3] = noc_req_out [gx + gy*NumGroupsX][1]; + assign noc_req_in_valid[gx+1 + gy*NumGroupsX][3] = noc_req_out_valid[gx + gy*NumGroupsX][1]; + assign noc_req_out_ready[gx + gy*NumGroupsX][1] = noc_req_in_ready [gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_in [gx+1 + gy*NumGroupsX][3] = noc_rsp_out [gx + gy*NumGroupsX][1]; + assign noc_rsp_in_valid[gx+1 + gy*NumGroupsX][3] = noc_rsp_out_valid[gx + gy*NumGroupsX][1]; + assign noc_rsp_out_ready[gx + gy*NumGroupsX][1] = noc_rsp_in_ready [gx+1 + gy*NumGroupsX][3]; + // West output of (gx+1,gy) → East input of (gx,gy) + assign noc_req_in [gx + gy*NumGroupsX][1] = noc_req_out [gx+1 + gy*NumGroupsX][3]; + assign noc_req_in_valid[gx + gy*NumGroupsX][1] = noc_req_out_valid[gx+1 + gy*NumGroupsX][3]; + assign noc_req_out_ready[gx+1 + gy*NumGroupsX][3] = 
noc_req_in_ready[gx + gy*NumGroupsX][1]; + assign noc_rsp_in [gx + gy*NumGroupsX][1] = noc_rsp_out [gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_in_valid[gx + gy*NumGroupsX][1] = noc_rsp_out_valid[gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_out_ready[gx+1 + gy*NumGroupsX][3] = noc_rsp_in_ready[gx + gy*NumGroupsX][1]; end - - end else begin : gen_tile - cachepool_tile #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NrCores ), - .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .TileIDWidth ( 1 ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_tile ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error_o ), - .debug_req_i ( debug_req_i ), - .meip_i ( meip_i ), - .mtip_i ( mtip_i ), - .msip_i ( msip_i ), - 
.hart_base_id_i ( hart_base_id_i ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .tile_id_i ( '0 ), - .private_start_addr_i ( private_start_addr ), - .axi_out_req_o ( axi_out_req [0] ), - .axi_out_resp_i ( axi_out_resp [0] ), - // Remote Ports (not used) - .remote_req_o ( ), - .remote_req_dst_o ( ), - .remote_rsp_i ( '0 ), - .remote_rsp_ready_i ( '0 ), - .remote_req_i ( '0 ), - .remote_rsp_o ( ), - .remote_rsp_ready_o ( ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req ), - .cache_refill_rsp_i ( cache_refill_rsp ), - .axi_wide_req_o ( axi_tile_req[0] ), - .axi_wide_rsp_i ( axi_tile_rsp[0] ), - // Peripherals - .icache_events_o ( icache_events ), - .icache_prefetch_enable_i ( icache_prefetch_enable ), - .cl_interrupt_i ( cl_interrupt ), - .dynamic_offset_i ( dynamic_offset ), - .l1d_private_i ( l1d_private ), - .l1d_insn_i ( l1d_insn ), - .l1d_insn_valid_i ( l1d_insn_valid ), - .l1d_insn_ready_o ( l1d_insn_ready ), - .l1d_busy_i ( l1d_busy ) - ); - - axi_to_reqrsp #( - .axi_req_t ( axi_mst_cache_req_t ), - .axi_rsp_t ( axi_mst_cache_resp_t ), - .AddrWidth ( AxiAddrWidth ), - .DataWidth ( AxiDataWidth ), - .UserWidth ( $bits(refill_user_t) ), - .IdWidth ( AxiIdWidthIn ), - .BufDepth ( NumSpatzOutstandingLoads ), - .reqrsp_req_t ( cache_trans_req_t ), - .reqrsp_rsp_t ( cache_trans_rsp_t ) - ) i_axi2reqrsp ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .busy_o ( ), - .axi_req_i ( axi_tile_req [0][TileMem] ), - .axi_rsp_o ( axi_tile_rsp [0][TileMem] ), - .reqrsp_req_o ( cache_core_req[0] ), - .reqrsp_rsp_i ( cache_core_rsp[0] ) - ); end - // Additional one port for iCache connection - localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; - always_comb begin - for (int t = 0; t < NumTiles; t++) begin - for (int p = 0; p < ReqrspPortsTile; p++) begin - automatic int unsigned xbar_idx = t*ReqrspPortsTile + p; - automatic int unsigned refill_idx = t*NumL1CtrlTile + p-1; - - if (p == 0) begin - // connect_icache_path - tile_req_chan [xbar_idx] = 
cache_core_req [t].q; - // Scrmable address - tile_req_chan [xbar_idx].addr = scrambleAddr(cache_core_req[t].q.addr); - tile_req_valid [xbar_idx] = cache_core_req [t].q_valid; - cache_core_rsp [t].q_ready = tile_req_ready [xbar_idx]; - - cache_core_rsp [t].p = tile_rsp_chan [xbar_idx]; - cache_core_rsp [t].p_valid = tile_rsp_valid [xbar_idx]; - tile_rsp_ready [xbar_idx] = cache_core_req [t].p_ready; - // Tile ID assignment - tile_req_chan [xbar_idx].user.tile_id = t; - end else begin - // connect_refill_path - tile_req_chan [xbar_idx] = cache_refill_req[refill_idx].q; - // Scramble address - tile_req_chan [xbar_idx].addr = scrambleAddr(cache_refill_req[refill_idx].q.addr); - tile_req_valid [xbar_idx] = cache_refill_req[refill_idx].q_valid; - cache_refill_rsp[refill_idx].q_ready = tile_req_ready [xbar_idx]; - - cache_refill_rsp[refill_idx].p = tile_rsp_chan [xbar_idx]; - cache_refill_rsp[refill_idx].p_valid = tile_rsp_valid [xbar_idx]; - tile_rsp_ready [xbar_idx] = cache_refill_req[refill_idx].p_ready; - // Tile ID assignment - tile_req_chan [xbar_idx].user.tile_id = t; - end - end + // North-South (vertical) interior connections + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_ns_conn + for (genvar gy = 0; gy < NumGroupsY-1; gy++) begin : gen_ns_conn_y + // North output of (gx,gy) (dir 0) → South input of (gx,gy+1) (dir 2) + assign noc_req_in [gx + (gy+1)*NumGroupsX][2] = noc_req_out [gx + gy*NumGroupsX][0]; + assign noc_req_in_valid[gx + (gy+1)*NumGroupsX][2] = noc_req_out_valid[gx + gy*NumGroupsX][0]; + assign noc_req_out_ready[gx + gy *NumGroupsX][0] = noc_req_in_ready[gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_in [gx + (gy+1)*NumGroupsX][2] = noc_rsp_out [gx + gy*NumGroupsX][0]; + assign noc_rsp_in_valid[gx + (gy+1)*NumGroupsX][2] = noc_rsp_out_valid[gx + gy*NumGroupsX][0]; + assign noc_rsp_out_ready[gx + gy *NumGroupsX][0] = noc_rsp_in_ready[gx + (gy+1)*NumGroupsX][2]; + // South output of (gx,gy+1) (dir 2) → North input of (gx,gy) (dir 0) + assign 
noc_req_in [gx + gy *NumGroupsX][0] = noc_req_out [gx + (gy+1)*NumGroupsX][2]; + assign noc_req_in_valid[gx + gy *NumGroupsX][0] = noc_req_out_valid[gx + (gy+1)*NumGroupsX][2]; + assign noc_req_out_ready[gx + (gy+1)*NumGroupsX][2] = noc_req_in_ready[gx + gy *NumGroupsX][0]; + assign noc_rsp_in [gx + gy *NumGroupsX][0] = noc_rsp_out [gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_in_valid[gx + gy *NumGroupsX][0] = noc_rsp_out_valid[gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_out_ready[gx + (gy+1)*NumGroupsX][2] = noc_rsp_in_ready[gx + gy *NumGroupsX][0]; end end - typedef struct packed { - int unsigned idx; - logic [AxiAddrWidth-1:0] base; - logic [AxiAddrWidth-1:0] mask; - } reqrsp_rule_t; - - reqrsp_rule_t [ClusterWideOutAxiPorts-1:0] xbar_rule; - - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - assign xbar_rule[i] = '{ - idx : i, - base : DramAddr + DramPerChSize * i, - mask : ({AxiAddrWidth{1'b1}} << $clog2(DramPerChSize)) - }; + // West boundary: gx=0 has no West neighbor (dir 3) + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_west_bnd + assign noc_req_in [gy*NumGroupsX][3] = '0; + assign noc_req_in_valid[gy*NumGroupsX][3] = '0; + assign noc_req_out_ready[gy*NumGroupsX][3] = '1; + assign noc_rsp_in [gy*NumGroupsX][3] = '0; + assign noc_rsp_in_valid[gy*NumGroupsX][3] = '0; + assign noc_rsp_out_ready[gy*NumGroupsX][3] = '1; end - logic [$clog2(ClusterWideOutAxiPorts):0] default_idx; - assign default_idx = ClusterWideOutAxiPorts; - - for (genvar inp = 0; inp < NumClusterMst*NumTiles; inp ++) begin : gen_xbar_sel - addr_decode_napot #( - .NoIndices (ClusterWideOutAxiPorts+1 ), - .NoRules (ClusterWideOutAxiPorts ), - .addr_t (axi_addr_t ), - .rule_t (reqrsp_rule_t ) - ) i_snitch_decode_napot ( - .addr_i (tile_req_chan[inp].addr), - .addr_map_i (xbar_rule ), - .idx_o (tile_sel_err[inp] ), - .dec_valid_o (/* Unused */ ), - .dec_error_o (/* Unused */ ), - .en_default_idx_i (1'b1 ), - .default_idx_i (default_idx ) - ); + // East boundary: 
gx=NumGroupsX-1 has no East neighbor (dir 1) + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_east_bnd + assign noc_req_in [(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_req_in_valid[(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_req_out_ready[(NumGroupsX-1) + gy*NumGroupsX][1] = '1; + assign noc_rsp_in [(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_rsp_in_valid[(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_rsp_out_ready[(NumGroupsX-1) + gy*NumGroupsX][1] = '1; + end - assign tile_sel[inp] = tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)-1:0]; + // South boundary: gy=0 has no South neighbor (dir 2) + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_south_bnd + assign noc_req_in [gx][2] = '0; + assign noc_req_in_valid[gx][2] = '0; + assign noc_req_out_ready[gx][2] = '1; + assign noc_rsp_in [gx][2] = '0; + assign noc_rsp_in_valid[gx][2] = '0; + assign noc_rsp_out_ready[gx][2] = '1; + end -`ifndef TARGET_SYNTHESIS - // Alert the system that we have illegal memory access - IllegalMemAccess : assert property( - @(posedge clk_i) disable iff (!rst_ni) (tile_req_valid[inp] |-> !tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)])) - else $error("Visited illegal address: time=%0t, port=%0d, addr=0x%08h", $time, inp, tile_req_chan[inp].addr); - // else $fatal (1, "Visited address is not mapped"); -`endif + // North boundary: gy=NumGroupsY-1 has no North neighbor (dir 0) + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_north_bnd + assign noc_req_in [gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_req_in_valid[gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_req_out_ready[gx + (NumGroupsY-1)*NumGroupsX][0] = '1; + assign noc_rsp_in [gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_rsp_in_valid[gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_rsp_out_ready[gx + (NumGroupsY-1)*NumGroupsX][0] = '1; end - reqrsp_xbar #( - .NumInp (NumClusterMst*NumTiles ), - .NumOut (ClusterWideOutAxiPorts ), - .PipeReg 
(1'b1 ), - .ExtReqPrio (1'b0 ), - .ExtRspPrio (Burst_Enable ), - .tcdm_req_chan_t (cache_trans_req_chan_t ), - .tcdm_rsp_chan_t (cache_trans_rsp_chan_t ) - ) i_cluster_xbar ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (tile_req_chan ), - .slv_req_valid_i (tile_req_valid ), - .slv_req_ready_o (tile_req_ready ), - .slv_rsp_o (tile_rsp_chan ), - .slv_rsp_valid_o (tile_rsp_valid ), - .slv_rsp_ready_i (tile_rsp_ready ), - .slv_sel_i (tile_sel[NumTiles*NumClusterMst-1:0] ), - .slv_rr_i ('0 ), - .slv_selected_o (tile_selected ), - .mst_req_o (l2_req_chan ), - .mst_req_valid_o (l2_req_valid ), - .mst_req_ready_i (l2_req_ready ), - .mst_rsp_i (l2_rsp_chan ), - .mst_rr_i (l2_rsp_rr ), - .mst_rsp_valid_i (l2_rsp_valid ), - .mst_rsp_ready_o (l2_rsp_ready ), - .mst_sel_i (l2_sel ) - ); + // ------------- + // To Main Memory: reqrsp_to_axi per group, then axi_mux across groups + // ------------- - for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin - // To L2 Channels - always_comb begin - l2_req[ch].q = '{ - addr : l2_req_chan[ch].addr, - write: l2_req_chan[ch].write, - amo : l2_req_chan[ch].amo, - data : l2_req_chan[ch].data, - strb : l2_req_chan[ch].strb, - size : l2_req_chan[ch].size, - default: '0 - }; - l2_req[ch].q.user = l2_req_chan[ch].user; - l2_req[ch].q_valid = l2_req_valid[ch] ; - l2_req_ready[ch] = l2_rsp[ch].q_ready; - - l2_rsp_chan [ch] = '{ - data : l2_rsp[ch].p.data, - error: l2_rsp[ch].p.error, - write: l2_rsp[ch].p.write, - default: '0 - }; - l2_rsp_chan [ch].user = l2_rsp[ch].p.user; - l2_rsp_valid[ch] = l2_rsp[ch].p_valid; - l2_req[ch].p_ready = l2_rsp_ready[ch]; - // calculate the port from the tile id and bank id - // bank_id == 0 --- bypass - // bank_id == 1-4 --- cache bank 0-3 - l2_sel[ch] = l2_rsp[ch].p.user.tile_id * NumClusterMst + l2_rsp[ch].p.user.bank_id; + // Step 1: Per-group reqrsp_to_axi conversion. 
+ for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_per_group_l2 + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_per_group_l2 + localparam int unsigned g = gy * NumGroupsX + gx; + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_per_ch + reqrsp_to_axi #( + .MaxTrans ( NumSpatzOutstandingLoads*2 ), + .ID ( '0 ), + .EnBurst ( 1 ), + .ShuffleId ( 1 ), + .UserWidth ( $bits(refill_user_t) ), + .ReqUserFallThrough ( 1'b0 ), + .DataWidth ( AxiDataWidth ), + .AxiUserWidth ( AxiUserWidth ), + .reqrsp_req_t ( l2_req_t ), + .reqrsp_rsp_t ( l2_rsp_t ), + .axi_req_t ( axi_premux_cache_req_t ), + .axi_rsp_t ( axi_premux_cache_resp_t ) + ) i_reqrsp2axi ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .user_i ( l2_req[g][ch].q.user ), + .reqrsp_req_i ( l2_req[g][ch] ), + .reqrsp_rsp_o ( l2_rsp[g][ch] ), + .axi_req_o ( wide_axi_premux_req[g][ch] ), + .axi_rsp_i ( wide_axi_premux_rsp[g][ch] ) + ); + end end end - for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch ++) begin : gen_output_axi - reqrsp_to_axi #( - .MaxTrans (NumSpatzOutstandingLoads*2 ), - .ID ('0 ), - .EnBurst (1 ), - .ShuffleId (1 ), - .UserWidth ($bits(refill_user_t) ), - .ReqUserFallThrough (1'b0 ), - .DataWidth (AxiDataWidth ), - .AxiUserWidth (AxiUserWidth ), - .reqrsp_req_t (l2_req_t ), - .reqrsp_rsp_t (l2_rsp_t ), - .axi_req_t (axi_slv_cache_req_t ), - .axi_rsp_t (axi_slv_cache_resp_t ) - ) i_reqrsp2axi ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .user_i (l2_req[ch].q.user ), - .reqrsp_req_i (l2_req[ch] ), - .reqrsp_rsp_o (l2_rsp[ch] ), - .axi_req_o (wide_axi_slv_req[ch] ), - .axi_rsp_i (wide_axi_slv_rsp[ch] ) - ); - end + // Step 2: Per-L2-channel axi_mux across groups. + if (NumGroups > 1) begin : gen_l2_group_mux + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_ch_mux + // Per-group ID remapper: reduces GroupAxiIdOutWidth to WideRefillIdWidth before the mux. + // axi_id_remap preserves ID independence (unlike axi_id_serialize) for performance. 
+ // AxiSlvPortMaxUniqIds = NumSpatzOutstandingLoads*2 matches the reqrsp_to_axi MaxTrans + // so the remapper never stalls. + for (genvar g = 0; g < NumGroups; g++) begin : gen_l2_mux_remap + axi_id_remap #( + .AxiSlvPortIdWidth ( GroupAxiIdOutWidth ), + .AxiSlvPortMaxUniqIds ( NumSpatzOutstandingLoads * 2 ), + .AxiMaxTxnsPerId ( NumSpatzOutstandingLoads ), + .AxiMstPortIdWidth ( WideIdWidthPreMux ), + .slv_req_t ( axi_premux_cache_req_t ), + .slv_resp_t ( axi_premux_cache_resp_t ), + .mst_req_t ( axi_remap_cache_req_t ), + .mst_resp_t ( axi_remap_cache_resp_t ) + ) i_l2_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( wide_axi_premux_req[g][ch] ), + .slv_resp_o ( wide_axi_premux_rsp[g][ch] ), + .mst_req_o ( wide_axi_remap_req[g][ch] ), + .mst_resp_i ( wide_axi_remap_rsp[g][ch] ) + ); + end + // Collect remapped per-group inputs for the mux. + axi_remap_cache_req_t [NumGroups-1:0] l2_mux_slv_req; + axi_remap_cache_resp_t [NumGroups-1:0] l2_mux_slv_rsp; + + for (genvar g = 0; g < NumGroups; g++) begin : gen_l2_mux_connect + assign l2_mux_slv_req[g] = wide_axi_remap_req[g][ch]; + assign wide_axi_remap_rsp[g][ch] = l2_mux_slv_rsp[g]; + end + + axi_mux #( + .SlvAxiIDWidth ( WideIdWidthPreMux ), + .slv_aw_chan_t ( axi_remap_cache_aw_chan_t ), + .mst_aw_chan_t ( axi_slv_cache_aw_chan_t ), + .w_chan_t ( axi_slv_cache_w_chan_t ), + .slv_b_chan_t ( axi_remap_cache_b_chan_t ), + .mst_b_chan_t ( axi_slv_cache_b_chan_t ), + .slv_ar_chan_t ( axi_remap_cache_ar_chan_t ), + .mst_ar_chan_t ( axi_slv_cache_ar_chan_t ), + .slv_r_chan_t ( axi_remap_cache_r_chan_t ), + .mst_r_chan_t ( axi_slv_cache_r_chan_t ), + .slv_req_t ( axi_remap_cache_req_t ), + .slv_resp_t ( axi_remap_cache_resp_t ), + .mst_req_t ( axi_slv_cache_req_t ), + .mst_resp_t ( axi_slv_cache_resp_t ), + .NoSlvPorts ( NumGroups ), + .FallThrough ( 0 ), + .SpillAw ( XbarLatency[4] ), + .SpillW ( XbarLatency[3] ), + .SpillB ( XbarLatency[2] ), + .SpillAr ( XbarLatency[1] ), + .SpillR ( XbarLatency[0] 
), + .MaxWTrans ( 2 ) + ) i_axi_l2_mux ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), + .slv_reqs_i ( l2_mux_slv_req ), + .slv_resps_o ( l2_mux_slv_rsp ), + .mst_req_o ( wide_axi_slv_req[ch] ), + .mst_resp_i ( wide_axi_slv_rsp[ch] ) + ); + end + end else begin : gen_l2_no_mux + // Single group: direct connection, no mux needed. + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_ch_direct + assign wide_axi_slv_req[ch] = wide_axi_premux_req[0][ch]; + assign wide_axi_premux_rsp[0][ch] = wide_axi_slv_rsp[ch]; + end + end - // ------------- - // To Main Memory - // ------------- // Optionally decouple the external wide AXI master port. for (genvar port = 0; port < ClusterWideOutAxiPorts; port ++) begin : gen_axi_out_cut axi_cut #( @@ -775,20 +551,20 @@ module cachepool_cluster axi_mux #( .SlvAxiIDWidth ( CsrAxiMstIdWidth ), - .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), // AW Channel Type, slave ports - .mst_aw_chan_t ( axi_uart_aw_chan_t ), // AW Channel Type, master port - .w_chan_t ( axi_uart_w_chan_t ), // W Channel Type, all ports - .slv_b_chan_t ( axi_csr_mst_b_chan_t ), // B Channel Type, slave ports - .mst_b_chan_t ( axi_uart_b_chan_t ), // B Channel Type, master port - .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), // AR Channel Type, slave ports - .mst_ar_chan_t ( axi_uart_ar_chan_t ), // AR Channel Type, master port - .slv_r_chan_t ( axi_csr_mst_r_chan_t ), // R Channel Type, slave ports - .mst_r_chan_t ( axi_uart_r_chan_t ), // R Channel Type, master port + .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), + .mst_aw_chan_t ( axi_uart_aw_chan_t ), + .w_chan_t ( axi_uart_w_chan_t ), + .slv_b_chan_t ( axi_csr_mst_b_chan_t ), + .mst_b_chan_t ( axi_uart_b_chan_t ), + .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), + .mst_ar_chan_t ( axi_uart_ar_chan_t ), + .slv_r_chan_t ( axi_csr_mst_r_chan_t ), + .mst_r_chan_t ( axi_uart_r_chan_t ), .slv_req_t ( axi_csr_mst_req_t ), .slv_resp_t ( axi_csr_mst_resp_t ), .mst_req_t ( axi_uart_req_t ), .mst_resp_t ( 
axi_uart_resp_t ), - .NoSlvPorts ( NumTiles ), // Number of Masters for the module + .NoSlvPorts ( NumTiles ), .FallThrough ( 0 ), .SpillAw ( XbarLatency[4] ), .SpillW ( XbarLatency[3] ), @@ -797,9 +573,9 @@ module cachepool_cluster .SpillR ( XbarLatency[0] ), .MaxWTrans ( 2 ) ) i_axi_uart_mux ( - .clk_i ( clk_i ), // Clock - .rst_ni ( rst_ni ), // Asynchronous reset active low - .test_i ( '0 ), // Test Mode enable + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), .slv_reqs_i ( axi_uart_mux_req ), .slv_resps_o ( axi_uart_mux_rsp ), .mst_req_o ( axi_narrow_req_o ), @@ -810,43 +586,6 @@ module cachepool_cluster assign axi_out_resp[0][ClusterUart] = axi_narrow_resp_i; end - /***** BootROM ****/ - for (genvar t = 0; t < NumTiles; t++) begin : gen_bootrom - axi_to_reg #( - .ADDR_WIDTH (AxiAddrWidth ), - .DATA_WIDTH (AxiDataWidth ), - .AXI_MAX_WRITE_TXNS (1 ), - .AXI_MAX_READ_TXNS (1 ), - .DECOUPLE_W (0 ), - .ID_WIDTH (WideIdWidthIn ), - .USER_WIDTH (AxiUserWidth ), - .axi_req_t (axi_mst_cache_req_t ), - .axi_rsp_t (axi_mst_cache_resp_t), - .reg_req_t (reg_cache_req_t ), - .reg_rsp_t (reg_cache_rsp_t ) - ) i_axi_to_reg_bootrom ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (1'b0 ), - .axi_req_i (axi_tile_req[t][TileBootROM] ), - .axi_rsp_o (axi_tile_rsp[t][TileBootROM] ), - .reg_req_o (bootrom_reg_req[t] ), - .reg_rsp_i (bootrom_reg_rsp[t] ) - ); - - bootrom i_bootrom ( - .clk_i (clk_i ), - .req_i (bootrom_reg_req[t].valid ), - .addr_i (addr_t'(bootrom_reg_req[t].addr) ), - .rdata_o(bootrom_reg_rsp[t].rdata ) - ); - - `FF(bootrom_reg_rsp[t].ready, bootrom_reg_req[t].valid, 1'b0) - - assign bootrom_reg_rsp[t].error = 1'b0; - end - - /***** CSR/Peripherals *****/ `REG_BUS_TYPEDEF_ALL(reg, narrow_addr_t, narrow_data_t, narrow_strb_t) @@ -860,6 +599,11 @@ module cachepool_cluster axi_narrow_req_t [NumTiles-1:0] axi_core_csr_req, axi_barrier_req; axi_narrow_resp_t [NumTiles-1:0] axi_core_csr_rsp, axi_barrier_rsp; + // Serialized CSR signals: one entry per 
tile plus one for the external axi_in port. + // Index [NumTiles] = axi_in_req_i, indices [NumTiles-1:0] = per-tile CSR outputs. + axi_csr_ser_req_t [NumTiles:0] axi_csr_pre_mux_req; + axi_csr_ser_resp_t [NumTiles:0] axi_csr_pre_mux_rsp; + for (genvar t = 0; t < NumTiles; t++) begin assign axi_barrier_req[t] = axi_out_req [t][ClusterPeriph]; @@ -895,23 +639,81 @@ module cachepool_cluster .cluster_periph_start_address_i ( tcdm_end_address ) ); + // Per-tile CSR ID serializers: reduce CsrAxiMstIdWidth to CsrSerIdWidth before the mux + // so the mux output stays bounded regardless of NumTiles. + for (genvar t = 0; t < NumTiles; t++) begin : gen_csr_id_serialize + axi_id_serialize #( + .AxiSlvPortIdWidth ( CsrAxiMstIdWidth ), + .AxiSlvPortMaxTxns ( 2 ), + .AxiMstPortIdWidth ( CsrSerIdWidth ), + .AxiMstPortMaxUniqIds ( 1 ), + .AxiMstPortMaxTxnsPerId ( 2 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( SpatzAxiNarrowDataWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .AtopSupport ( 1'b0 ), + .slv_req_t ( axi_narrow_req_t ), + .slv_resp_t ( axi_narrow_resp_t ), + .mst_req_t ( axi_csr_ser_req_t ), + .mst_resp_t ( axi_csr_ser_resp_t ), + // Provide one dummy entry to avoid [IdMapNumEntries-1:0] underflow when 0. + // Entry maps ID 0 -> 0, which is identical to the default modulo formula. + .IdMapNumEntries ( 1 ), + .IdMap ( '{'{32'd0, 32'd0}} ) + ) i_csr_id_serialize ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_core_csr_req[t] ), + .slv_resp_o ( axi_core_csr_rsp[t] ), + .mst_req_o ( axi_csr_pre_mux_req[t] ), + .mst_resp_i ( axi_csr_pre_mux_rsp[t] ) + ); + end + + // Serializer for the external axi_in port (SoC CSR access). 
+ axi_id_serialize #( + .AxiSlvPortIdWidth ( AxiIdWidthIn ), + .AxiSlvPortMaxTxns ( 2 ), + .AxiMstPortIdWidth ( CsrSerIdWidth ), + .AxiMstPortMaxUniqIds ( 1 ), + .AxiMstPortMaxTxnsPerId ( 2 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( SpatzAxiNarrowDataWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .AtopSupport ( 1'b0 ), + .slv_req_t ( axi_in_req_t ), + .slv_resp_t ( axi_in_resp_t ), + .mst_req_t ( axi_csr_ser_req_t ), + .mst_resp_t ( axi_csr_ser_resp_t ), + // Provide one dummy entry to avoid [IdMapNumEntries-1:0] underflow when 0. + // Entry maps ID 0 -> 0, which is identical to the default modulo formula. + .IdMapNumEntries ( 1 ), + .IdMap ( '{'{32'd0, 32'd0}} ) + ) i_csr_in_id_serialize ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_in_req_i ), + .slv_resp_o ( axi_in_resp_o ), + .mst_req_o ( axi_csr_pre_mux_req[NumTiles] ), + .mst_resp_i ( axi_csr_pre_mux_rsp[NumTiles] ) + ); axi_mux #( - .SlvAxiIDWidth ( CsrAxiMstIdWidth ), - .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), // AW Channel Type, slave ports - .mst_aw_chan_t ( axi_csr_slv_aw_chan_t ), // AW Channel Type, master port - .w_chan_t ( axi_csr_slv_w_chan_t ), // W Channel Type, all ports - .slv_b_chan_t ( axi_csr_mst_b_chan_t ), // B Channel Type, slave ports - .mst_b_chan_t ( axi_csr_slv_b_chan_t ), // B Channel Type, master port - .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), // AR Channel Type, slave ports - .mst_ar_chan_t ( axi_csr_slv_ar_chan_t ), // AR Channel Type, master port - .slv_r_chan_t ( axi_csr_mst_r_chan_t ), // R Channel Type, slave ports - .mst_r_chan_t ( axi_csr_slv_r_chan_t ), // R Channel Type, master port - .slv_req_t ( axi_csr_mst_req_t ), - .slv_resp_t ( axi_csr_mst_resp_t ), + .SlvAxiIDWidth ( CsrSerIdWidth ), + .slv_aw_chan_t ( axi_csr_ser_aw_chan_t ), + .mst_aw_chan_t ( axi_csr_slv_aw_chan_t ), + .w_chan_t ( axi_csr_slv_w_chan_t ), + .slv_b_chan_t ( axi_csr_ser_b_chan_t ), + .mst_b_chan_t ( axi_csr_slv_b_chan_t ), + .slv_ar_chan_t ( axi_csr_ser_ar_chan_t ), 
+ .mst_ar_chan_t ( axi_csr_slv_ar_chan_t ), + .slv_r_chan_t ( axi_csr_ser_r_chan_t ), + .mst_r_chan_t ( axi_csr_slv_r_chan_t ), + .slv_req_t ( axi_csr_ser_req_t ), + .slv_resp_t ( axi_csr_ser_resp_t ), .mst_req_t ( axi_csr_slv_req_t ), .mst_resp_t ( axi_csr_slv_resp_t ), - .NoSlvPorts ( NumTiles + 1 ), // Number of Masters for the module + .NoSlvPorts ( NumTiles + 1 ), .FallThrough ( 0 ), .SpillAw ( XbarLatency[4] ), .SpillW ( XbarLatency[3] ), @@ -920,13 +722,13 @@ module cachepool_cluster .SpillR ( XbarLatency[0] ), .MaxWTrans ( 2 ) ) i_axi_csr_mux ( - .clk_i ( clk_i ), // Clock - .rst_ni ( rst_ni ), // Asynchronous reset active low - .test_i ('0 ), // Test Mode enable - .slv_reqs_i ( {axi_in_req_i, axi_core_csr_req} ), - .slv_resps_o ( {axi_in_resp_o, axi_core_csr_rsp} ), - .mst_req_o ( axi_csr_req ), - .mst_resp_i ( axi_csr_rsp ) + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ('0 ), + .slv_reqs_i ( axi_csr_pre_mux_req ), + .slv_resps_o ( axi_csr_pre_mux_rsp ), + .mst_req_o ( axi_csr_req ), + .mst_resp_i ( axi_csr_rsp ) ); axi_to_reg #( @@ -942,39 +744,15 @@ module cachepool_cluster .reg_req_t (reg_req_t ), .reg_rsp_t (reg_rsp_t ) ) i_csr_axi_to_reg ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (1'b0 ), - .axi_req_i (axi_csr_req ), - .axi_rsp_o (axi_csr_rsp ), - .reg_req_o (reg_req ), - .reg_rsp_i (reg_rsp ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .axi_req_i (axi_csr_req ), + .axi_rsp_o (axi_csr_rsp ), + .reg_req_o (reg_req ), + .reg_rsp_i (reg_rsp ) ); - - // Event counter increments for the TCDM. - typedef struct packed { - /// Number requests going in - logic [$clog2(5):0] inc_accessed; - /// Number of requests stalled due to congestion - logic [$clog2(5):0] inc_congested; - } tcdm_events_t; - - // Event counter increments for DMA. 
- typedef struct packed { - logic aw_stall, ar_stall, r_stall, w_stall, - buf_w_stall, buf_r_stall; - logic aw_valid, aw_ready, aw_done, aw_bw; - logic ar_valid, ar_ready, ar_done, ar_bw; - logic r_valid, r_ready, r_done, r_bw; - logic w_valid, w_ready, w_done, w_bw; - logic b_valid, b_ready, b_done; - logic dma_busy; - axi_pkg::len_t aw_len, ar_len; - axi_pkg::size_t aw_size, ar_size; - logic [$clog2(SpatzAxiNarrowDataWidth/8):0] num_bytes_written; - } dma_events_t; - cachepool_peripheral #( .AddrWidth (AxiAddrWidth ), .SPMWidth ($clog2(L1NumSet)), diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index b14d1ac..1e6e5d4 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -4,19 +4,9 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" -`include "common_cells/assertions.svh" `include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" `include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" -`include "reqrsp_interface/typedef.svh" -`include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" /// Group implementation for CachePool module cachepool_group @@ -24,7 +14,7 @@ module cachepool_group import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; - import snitch_icache_pkg::icache_events_t; + import snitch_icache_pkg::icache_l1_events_t; #( /// Width of physical address. parameter int unsigned AxiAddrWidth = 48, @@ -48,10 +38,6 @@ module cachepool_group parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. - parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. 
parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -65,8 +51,6 @@ module cachepool_group /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = 0, @@ -107,7 +91,12 @@ module cachepool_group parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` and `L1NumTagBank` is changed ***/ - parameter int unsigned NrSramCfg = 1 + parameter int unsigned NrSramCfg = 1, + + localparam int unsigned TotRGPorts = (NumRemoteGroupPortCore == 0) ? 0 : + NumTilesPerGroup*NumRemoteGroupPortCore*NrTCDMPortsPerCore-1, + localparam int unsigned NumRemoteGroupPortTile = (NumRemoteGroupPortCore == 0) ? 1 : + NumRemoteGroupPortCore * NrTCDMPortsPerCore ) ( /// System clock. input logic clk_i, @@ -115,48 +104,60 @@ module cachepool_group input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. 
Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. input logic [9:0] hart_base_id_i, + /// Globally-unique tile ID of the first tile in this group (= group_index * NumTilesPerGroup). + input logic [TileIDWidth-1:0] tile_base_id_i, /// Base address of cluster. TCDM and cluster peripheral location are derived from /// it. This signal is pseudo-static. input axi_addr_t cluster_base_addr_i, /// Partitioning address input axi_addr_t private_start_addr_i, /// AXI Narrow out-port (UART/Peripheral) - output axi_narrow_req_t [GroupNarrowAxiPorts-1:0] axi_narrow_req_o, - input axi_narrow_resp_t [GroupNarrowAxiPorts-1:0] axi_narrow_rsp_i, - /// Wide AXI ports to cluster level - output axi_out_req_t [GroupWideAxiPorts-1:0] axi_wide_req_o, - input axi_out_resp_t [GroupWideAxiPorts-1:0] axi_wide_rsp_i, + output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, + input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, - /// Cache refill ports - output cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req_o, - input cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp_i, + /// DRAM refill reqrsp ports (post-xbar, one per L2 channel) + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, /// Peripheral signals - output icache_events_t [NrCores-1:0] icache_events_o, + output icache_l1_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, input logic [NrCores-1:0] cl_interrupt_i, input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, input logic [3:0] l1d_private_i, - input cache_insn_t l1d_insn_i, + input cache_insn_t 
l1d_insn_i, input logic l1d_insn_valid_i, - output logic [NumTiles-1:0] l1d_insn_ready_o, - input logic [NumTiles-1:0] l1d_busy_i, + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, + + /// Inter-group remote access ports (to other groups). + /// Layout: [NumTilesPerGroup-1:0][NumRemoteGroupPortTile-1:0] flattened to + /// [NumTilesPerGroup * NumRemoteGroupPortTile - 1 : 0]. + /// Per-tile flat index: j + r * NrTCDMPortsPerCore (j = interco instance, + /// r = inter-group slot within that instance). + /// NumRemoteGroupPortTile = NumRemoteGroupPortCore * NrTCDMPortsPerCore. + /// Uses REQRSP-style types with built-in ready and remote_group_user_t. + output remote_group_req_t [TotRGPorts:0] remote_group_req_o, + input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, + /// Inter-group remote access ports (from other groups) + input remote_group_req_t [TotRGPorts:0] remote_group_req_i, + output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// SRAM Configuration input impl_in_t [NrSramCfg-1:0] impl_i, @@ -173,17 +174,9 @@ module cachepool_group // --------- // Constants // --------- - /// Minimum width to hold the core number. - localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); - localparam int unsigned TileIDWidth = cf_math_pkg::idx_width(NumTiles); - - // Enlarge the address width for Spatz due to cache - localparam int unsigned TCDMAddrWidth = L1AddrWidth; + // Per-group overrides of package-level constants that depend on NumTiles/NumCores. 
+ localparam int unsigned NumL1CacheCtrlLocal = NrCores; - // Core Request, SoC Request - localparam int unsigned NrNarrowMasters = 2; - - localparam int unsigned WideIdWidthOut = AxiIdWidthOut; localparam int unsigned WideIdWidthIn = AxiIdWidthOut; @@ -194,13 +187,9 @@ module cachepool_group typedef logic [AxiDataWidth-1:0] data_cache_t; typedef logic [AxiDataWidth/8-1:0] strb_cache_t; typedef logic [WideIdWidthIn-1:0] id_cache_mst_t; - typedef logic [WideIdWidthOut-1:0] id_cache_slv_t; typedef logic [AxiUserWidth-1:0] user_cache_t; `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) - `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) - - `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t) typedef struct packed { int unsigned idx; @@ -208,43 +197,472 @@ module cachepool_group addr_t end_addr; } xbar_rule_t; - `SNITCH_VM_TYPEDEF(AxiAddrWidth) - // --------------- // CachePool Tile // --------------- - logic [NumTiles-1:0] error; + logic [NumTilesPerGroup-1:0] error; assign error_o = |error; + // Internal tile-side wide AXI: split into two flat arrays by port function + // BootROM (TileBootROM=0): muxed into single shared bootrom in this group + axi_mst_cache_req_t [NumTilesPerGroup-1:0] axi_tile_bootrom_req; + axi_mst_cache_resp_t [NumTilesPerGroup-1:0] axi_tile_bootrom_rsp; + // TileMem (TileMem=1): stays in group, fed into axi_to_reqrsp + axi_mst_cache_req_t [NumTilesPerGroup-1:0] axi_tile_mem_req; + axi_mst_cache_resp_t [NumTilesPerGroup-1:0] axi_tile_mem_rsp; + + // Per-group bootrom mux AXI type: the mux prepends $clog2(NumTilesPerGroup) + // bits to the ID, not $clog2(NumTiles) as the package assumes. 
+ localparam int unsigned LocalBootRomIdWidth = WideIdWidthIn + $clog2(NumTilesPerGroup); + typedef logic [LocalBootRomIdWidth-1:0] local_bootrom_id_t; + `AXI_TYPEDEF_ALL(local_bootrom, addr_t, local_bootrom_id_t, data_cache_t, strb_cache_t, user_cache_t) + + // Mux all per-tile BootROM AXI ports into a single bootrom instance + local_bootrom_req_t axi_bootrom_mux_req; + local_bootrom_resp_t axi_bootrom_mux_rsp; + + if (NumTilesPerGroup > 1) begin : gen_bootrom_mux + axi_mux #( + .SlvAxiIDWidth ( WideIdWidthIn ), + .slv_aw_chan_t ( axi_mst_cache_aw_chan_t ), + .mst_aw_chan_t ( local_bootrom_aw_chan_t ), + .w_chan_t ( axi_mst_cache_w_chan_t ), + .slv_b_chan_t ( axi_mst_cache_b_chan_t ), + .mst_b_chan_t ( local_bootrom_b_chan_t ), + .slv_ar_chan_t ( axi_mst_cache_ar_chan_t ), + .mst_ar_chan_t ( local_bootrom_ar_chan_t ), + .slv_r_chan_t ( axi_mst_cache_r_chan_t ), + .mst_r_chan_t ( local_bootrom_r_chan_t ), + .slv_req_t ( axi_mst_cache_req_t ), + .slv_resp_t ( axi_mst_cache_resp_t ), + .mst_req_t ( local_bootrom_req_t ), + .mst_resp_t ( local_bootrom_resp_t ), + .NoSlvPorts ( NumTilesPerGroup ), + .FallThrough ( 0 ), + .SpillAw ( XbarLatency[4] ), + .SpillW ( XbarLatency[3] ), + .SpillB ( XbarLatency[2] ), + .SpillAr ( XbarLatency[1] ), + .SpillR ( XbarLatency[0] ), + .MaxWTrans ( 2 ) + ) i_axi_bootrom_mux ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), + .slv_reqs_i ( axi_tile_bootrom_req ), + .slv_resps_o( axi_tile_bootrom_rsp ), + .mst_req_o ( axi_bootrom_mux_req ), + .mst_resp_i ( axi_bootrom_mux_rsp ) + ); + end else begin : gen_bootrom_connect + // NumTilesPerGroup==1: direct connect, no ID widening needed + assign axi_bootrom_mux_req = local_bootrom_req_t'(axi_tile_bootrom_req[0]); + assign axi_tile_bootrom_rsp[0] = axi_mst_cache_resp_t'(axi_bootrom_mux_rsp); + end + + // Single BootROM instance shared across all tiles in the group + `REG_BUS_TYPEDEF_ALL(reg_bootrom, addr_t, data_cache_t, strb_cache_t) + reg_bootrom_req_t bootrom_reg_req; + 
reg_bootrom_rsp_t bootrom_reg_rsp; + + axi_to_reg #( + .ADDR_WIDTH ( AxiAddrWidth ), + .DATA_WIDTH ( AxiDataWidth ), + .AXI_MAX_WRITE_TXNS ( 1 ), + .AXI_MAX_READ_TXNS ( 1 ), + .DECOUPLE_W ( 0 ), + .ID_WIDTH ( LocalBootRomIdWidth ), + .USER_WIDTH ( AxiUserWidth ), + .axi_req_t ( local_bootrom_req_t ), + .axi_rsp_t ( local_bootrom_resp_t ), + .reg_req_t ( reg_bootrom_req_t ), + .reg_rsp_t ( reg_bootrom_rsp_t ) + ) i_axi_to_reg_bootrom ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .testmode_i ( 1'b0 ), + .axi_req_i ( axi_bootrom_mux_req ), + .axi_rsp_o ( axi_bootrom_mux_rsp ), + .reg_req_o ( bootrom_reg_req ), + .reg_rsp_i ( bootrom_reg_rsp ) + ); + + bootrom i_bootrom ( + .clk_i ( clk_i ), + .req_i ( bootrom_reg_req.valid ), + .addr_i ( addr_t'(bootrom_reg_req.addr) ), + .rdata_o ( bootrom_reg_rsp.rdata ) + ); + + `FF(bootrom_reg_rsp.ready, bootrom_reg_req.valid, 1'b0) + assign bootrom_reg_rsp.error = 1'b0; + + // Cache refill ports from tiles (NumL1CacheCtrlLocal = NrCores entries) + cache_trans_req_t [NumL1CacheCtrlLocal-1:0] cache_refill_req; + cache_trans_rsp_t [NumL1CacheCtrlLocal-1:0] cache_refill_rsp; + + // L2 Group ICache AXI master output (from axi_hier_interco) + axi_mst_cache_req_t axi_l2icache_mst_req; + axi_mst_cache_resp_t axi_l2icache_mst_rsp; + // L2 Group ICache reqrsp output (to xbar port 0) + cache_trans_req_t cache_l2icache_req; + cache_trans_rsp_t cache_l2icache_rsp; + // L2 Group ICache control (hardwired) + ro_cache_ctrl_t l2icache_ctrl; + + // Flat xbar input channels: NumTilesPerGroup * NumClusterMst ports + cache_trans_req_chan_t [NumTilesPerGroup*NumClusterMst-1:0] tile_req_chan; + cache_trans_rsp_chan_t [NumTilesPerGroup*NumClusterMst-1:0] tile_rsp_chan; + logic [NumTilesPerGroup*NumClusterMst-1:0] tile_req_valid, tile_req_ready, + tile_rsp_valid, tile_rsp_ready; + + // Xbar output channels: one per L2 channel + cache_trans_req_chan_t [ClusterWideOutAxiPorts-1:0] l2_req_chan; + cache_trans_rsp_chan_t [ClusterWideOutAxiPorts-1:0] 
l2_rsp_chan; + logic [ClusterWideOutAxiPorts-1:0] l2_req_valid, l2_req_ready, + l2_rsp_valid, l2_rsp_ready; + + // Selection types + typedef logic [$clog2(NumClusterMst*NumTilesPerGroup)-1:0] l2_sel_t; + typedef logic [$clog2(ClusterWideOutAxiPorts) :0] tile_sel_err_t; // one extra bit for OOB (out-of-bounds) + typedef logic [$clog2(ClusterWideOutAxiPorts)-1:0] tile_sel_t; + + tile_sel_err_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel_err; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel; + l2_sel_t [ClusterWideOutAxiPorts-1:0] l2_sel; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] l2_rsp_rr; + + logic [NumTilesPerGroup*NumClusterMst-1:0] rr_lock_d, rr_lock_q; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] l2_prio_d, l2_prio_q; + + // port_id: which xbar input port does each L2 channel response target + l2_sel_t [ClusterWideOutAxiPorts-1:0] port_id; + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign port_id[i] = l2_rsp_i[i].p.user.tile_id * NumClusterMst + + l2_rsp_i[i].p.user.bank_id; + end + + // --------------------- + // L2 Group ICache: NumTilesPerGroup-to-1 AXI mux + read-only cache + ID remap + // --------------------- + always_comb begin + l2icache_ctrl = '0; + l2icache_ctrl.enable = 1'b1; + l2icache_ctrl.flush_valid = 1'b0; + l2icache_ctrl.start_addr[0] = DramAddr; + l2icache_ctrl.end_addr[0] = DramAddr + DramSize; + end + + axi_hier_interco #( + .NumSlvPorts ( NumTilesPerGroup ), + .NumMstPorts ( 1 ), + .Radix ( NumTilesPerGroup ), + .EnableCache ( 1 ), + .CacheLineWidth ( L2ICacheLineWidth ), + .CacheSizeByte ( L2ICacheSizeByte ), + .CacheSets ( L2ICacheSets ), + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .SlvIdWidth ( WideIdWidthIn ), + .MstIdWidth ( WideIdWidthIn ), + .UserWidth ( AxiUserWidth ), + .slv_req_t ( axi_mst_cache_req_t ), + .slv_resp_t ( axi_mst_cache_resp_t ), + .mst_req_t ( axi_mst_cache_req_t ), + .mst_resp_t ( axi_mst_cache_resp_t ) + ) i_l2icache_interco ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( 
1'b0 ), + .ro_cache_ctrl_i ( l2icache_ctrl ), + .slv_req_i ( axi_tile_mem_req ), + .slv_resp_o ( axi_tile_mem_rsp ), + .mst_req_o ( axi_l2icache_mst_req ), + .mst_resp_i ( axi_l2icache_mst_rsp ) + ); + + // Single axi_to_reqrsp for the L2 ICache master output + axi_to_reqrsp #( + .axi_req_t ( axi_mst_cache_req_t ), + .axi_rsp_t ( axi_mst_cache_resp_t ), + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .UserWidth ( $bits(refill_user_t) ), + .IdWidth ( WideIdWidthIn ), + .BufDepth ( NumSpatzOutstandingLoads ), + .reqrsp_req_t ( cache_trans_req_t ), + .reqrsp_rsp_t ( cache_trans_rsp_t ) + ) i_l2icache_axi2reqrsp ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .busy_o ( ), + .axi_req_i ( axi_l2icache_mst_req ), + .axi_rsp_o ( axi_l2icache_mst_rsp ), + .reqrsp_req_o ( cache_l2icache_req ), + .reqrsp_rsp_i ( cache_l2icache_rsp ) + ); + + // --------------------- + // Wiring: assemble flat xbar input from icache-bypass and refill paths + // --------------------- + // Port layout per tile: p=0 -> L2 ICache output (t=0) or unused (t>0), + // p=1..NumL1CtrlTile -> refill (cache_refill_req) + localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; + always_comb begin + for (int t = 0; t < NumTilesPerGroup; t++) begin + for (int p = 0; p < ReqrspPortsTile; p++) begin + automatic int unsigned xbar_idx = t * ReqrspPortsTile + p; + automatic int unsigned refill_idx = t * NumL1CtrlTile + p - 1; + + if (p == 0) begin + if (t == 0) begin + // L2 ICache output → xbar port 0 + tile_req_chan [xbar_idx] = cache_l2icache_req.q; + tile_req_chan [xbar_idx].addr = scrambleAddr(cache_l2icache_req.q.addr); + tile_req_valid [xbar_idx] = cache_l2icache_req.q_valid; + cache_l2icache_rsp.q_ready = tile_req_ready[xbar_idx]; + + cache_l2icache_rsp.p = tile_rsp_chan [xbar_idx]; + cache_l2icache_rsp.p_valid = tile_rsp_valid[xbar_idx]; + tile_rsp_ready [xbar_idx] = cache_l2icache_req.p_ready; + tile_req_chan [xbar_idx].user.tile_id = '0; + end else begin + // unused icache-bypass 
ports (tiles with t > 0) + tile_req_chan [xbar_idx] = '0; + tile_req_valid [xbar_idx] = 1'b0; + tile_rsp_ready [xbar_idx] = 1'b0; + end + end else begin + // refill path + tile_req_chan [xbar_idx] = cache_refill_req[refill_idx].q; + tile_req_chan [xbar_idx].addr = scrambleAddr(cache_refill_req[refill_idx].q.addr); + tile_req_valid [xbar_idx] = cache_refill_req[refill_idx].q_valid; + cache_refill_rsp[refill_idx].q_ready = tile_req_ready[xbar_idx]; + + cache_refill_rsp[refill_idx].p = tile_rsp_chan [xbar_idx]; + cache_refill_rsp[refill_idx].p_valid = tile_rsp_valid[xbar_idx]; + tile_rsp_ready [xbar_idx] = cache_refill_req[refill_idx].p_ready; + tile_req_chan [xbar_idx].user.tile_id = t; + end + end + end + end + + // --------------------- + // Address decoder: select L2 channel per xbar input port + // --------------------- + typedef struct packed { + int unsigned idx; + logic [AxiAddrWidth-1:0] base; + logic [AxiAddrWidth-1:0] mask; + } reqrsp_rule_t; + + reqrsp_rule_t [ClusterWideOutAxiPorts-1:0] xbar_rule; + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign xbar_rule[i] = '{ + idx : i, + base : DramAddr + DramPerChSize * i, + mask : ({AxiAddrWidth{1'b1}} << $clog2(DramPerChSize)) + }; + end + + logic [$clog2(ClusterWideOutAxiPorts):0] default_idx; + assign default_idx = ClusterWideOutAxiPorts; + + for (genvar inp = 0; inp < NumClusterMst*NumTilesPerGroup; inp++) begin : gen_xbar_sel + addr_decode_napot #( + .NoIndices ( ClusterWideOutAxiPorts+1 ), + .NoRules ( ClusterWideOutAxiPorts ), + .addr_t ( axi_addr_t ), + .rule_t ( reqrsp_rule_t ) + ) i_snitch_decode_napot ( + .addr_i ( tile_req_chan[inp].addr ), + .addr_map_i ( xbar_rule ), + .idx_o ( tile_sel_err[inp] ), + .dec_valid_o ( /* unused */ ), + .dec_error_o ( /* unused */ ), + .en_default_idx_i ( 1'b1 ), + .default_idx_i ( default_idx ) + ); + assign tile_sel[inp] = tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)-1:0]; + +`ifndef TARGET_SYNTHESIS + IllegalMemAccess : assert property ( + @(posedge 
clk_i) disable iff (!rst_ni) + (tile_req_valid[inp] |-> !tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)])) + else $error("Visited illegal address: time=%0t, port=%0d, addr=0x%08h", + $time, inp, tile_req_chan[inp].addr); +`endif + end + + // --------------------- + // Burst protection logic + // --------------------- + if (Burst_Enable) begin : gen_burst_ext_sel + `FF(rr_lock_q, rr_lock_d, 1'b0) + `FF(l2_prio_q, l2_prio_d, 1'b0) + + for (genvar port = 0; port < NumTilesPerGroup*NumClusterMst; port++) begin : gen_rsp_rr + tile_sel_t l2_rr; + logic [ClusterWideOutAxiPorts-1:0] arb_valid; + + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign arb_valid[i] = (port_id[i] == port) & l2_rsp_valid[i]; + end + + always_comb begin + l2_prio_d[port] = l2_prio_q[port]; + rr_lock_d[port] = rr_lock_q[port]; + + if (|arb_valid) begin + if (rr_lock_q[port]) begin + l2_prio_d[port] = l2_prio_q[port]; + end else begin + l2_prio_d[port] = l2_rr; + end + end + l2_rsp_rr[port] = l2_prio_d[port]; + + if (tile_rsp_chan[port].user.burst.is_burst & |arb_valid) begin + if (tile_rsp_chan[port].user.burst.burst_len == 0) begin + rr_lock_d[port] = 1'b0; + end else begin + rr_lock_d[port] = 1'b1; + end + end + end + + rr_arb_tree #( + .NumIn ( ClusterWideOutAxiPorts ), + .DataType ( logic ), + .ExtPrio ( 1'b0 ), + .AxiVldRdy ( 1'b1 ), + .LockIn ( 1'b1 ) + ) i_rr_arb_tree ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .flush_i ( '0 ), + .rr_i ( '0 ), + .req_i ( arb_valid ), + .gnt_o ( /* not used */ ), + .data_i ( '0 ), + .req_o ( /* not used */ ), + .gnt_i ( tile_rsp_ready[port]), + .data_o ( /* not used */ ), + .idx_o ( l2_rr ) + ); + end + end else begin + assign l2_prio_d = '0; + assign l2_prio_q = '0; + assign rr_lock_d = '0; + assign rr_lock_q = '0; + assign l2_rsp_rr = '0; + end + + // --------------------- + // Refill (DRAM) xbar + // --------------------- + reqrsp_xbar #( + .NumInp ( NumClusterMst*NumTilesPerGroup ), + .NumOut ( ClusterWideOutAxiPorts ), + .PipeReg ( 
1'b1 ), + .ExtReqPrio ( 1'b0 ), + .ExtRspPrio ( Burst_Enable ), + .tcdm_req_chan_t ( cache_trans_req_chan_t ), + .tcdm_rsp_chan_t ( cache_trans_rsp_chan_t ) + ) i_refill_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( tile_req_chan ), + .slv_req_valid_i ( tile_req_valid ), + .slv_req_ready_o ( tile_req_ready ), + .slv_rsp_o ( tile_rsp_chan ), + .slv_rsp_valid_o ( tile_rsp_valid ), + .slv_rsp_ready_i ( tile_rsp_ready ), + .slv_sel_i ( tile_sel[NumTilesPerGroup*NumClusterMst-1:0] ), + .slv_rr_i ( '0 ), + .slv_selected_o ( /* unused */ ), + .mst_req_o ( l2_req_chan ), + .mst_req_valid_o ( l2_req_valid ), + .mst_req_ready_i ( l2_req_ready ), + .mst_rsp_i ( l2_rsp_chan ), + .mst_rr_i ( l2_rsp_rr ), + .mst_rsp_valid_i ( l2_rsp_valid ), + .mst_rsp_ready_o ( l2_rsp_ready ), + .mst_sel_i ( l2_sel ) + ); + + // --------------------- + // l2_req/rsp packing: bridge xbar channels <-> l2_req_t/l2_rsp_t port + // --------------------- + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_pack + always_comb begin + // Request: xbar -> group output port + l2_req_o[ch].q = '{ + addr : l2_req_chan[ch].addr, + write : l2_req_chan[ch].write, + amo : l2_req_chan[ch].amo, + data : l2_req_chan[ch].data, + strb : l2_req_chan[ch].strb, + size : l2_req_chan[ch].size, + default: '0 + }; + l2_req_o[ch].q.user = l2_req_chan[ch].user; + l2_req_o[ch].q_valid = l2_req_valid[ch]; + l2_req_ready[ch] = l2_rsp_i[ch].q_ready; + + // Response: group input port -> xbar + l2_rsp_chan[ch] = '{ + data : l2_rsp_i[ch].p.data, + error : l2_rsp_i[ch].p.error, + write : l2_rsp_i[ch].p.write, + default: '0 + }; + l2_rsp_chan[ch].user = l2_rsp_i[ch].p.user; + l2_rsp_valid[ch] = l2_rsp_i[ch].p_valid; + l2_req_o[ch].p_ready = l2_rsp_ready[ch]; + + // Response demux: which xbar input port does this response target? 
+ l2_sel[ch] = l2_rsp_i[ch].p.user.tile_id * NumClusterMst + + l2_rsp_i[ch].p.user.bank_id; + end + end + // Tile remote access signals // In/Out relative to the tile (out--leave a tile; in--enter a tile) // Tile-side flat layout: index = j + r*NrTCDMPortsPerCore (j=xbar idx, r=remote slot within xbar) - tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_req; - tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_rsp; - logic [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_ready, tile_remote_out_ready; + tcdm_req_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_out_req; + tcdm_rsp_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_out_rsp; + logic [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_ready, tile_remote_out_ready; - tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_req; - tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_rsp; + tcdm_req_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_req; + tcdm_rsp_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_rsp; - // Xbar-side: NrTCDMPortsPerCore xbars, each with NumTiles*NumRemotePortCore ports + // Xbar-side: NrTCDMPortsPerCore xbars, each with NumTilesPerGroup*NumRemotePortCore ports // Xbar port index = t*NumRemotePortCore + r - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; + 
tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_rsp_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; + + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; + tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_rsp_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; + // Per-group override of package-level remote xbar selection width. + // The package uses NumTiles (total), but the group's xbar is sized per-group. 
+ localparam int unsigned LocalRemoteXbarSelWidth = $clog2(NumTilesPerGroup * NumRemotePortCore); + typedef logic [LocalRemoteXbarSelWidth-1:0] local_remote_xbar_sel_t; // Tile-side selection: narrow type, only carries tile_id - remote_tile_sel_t [NumTiles-1:0][NumRemotePortTile-1:0] remote_out_sel_tile; + remote_tile_sel_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] remote_out_sel_tile; // Xbar-side selection: wider type, encodes tile_id*NumRemotePortCore + core_id%NumRemotePortCore - remote_xbar_sel_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] remote_out_sel_xbar, remote_in_sel_xbar; + local_remote_xbar_sel_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] remote_out_sel_xbar, remote_in_sel_xbar; - for (genvar t = 0; t < NumTiles; t++) begin + for (genvar t = 0; t < NumTilesPerGroup; t++) begin for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin for (genvar r = 0; r < NumRemotePortCore; r++) begin // tile flat index: j + r*NrTCDMPortsPerCore @@ -265,113 +683,197 @@ module cachepool_group assign tile_remote_in_rsp_valid[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p_valid; assign tile_remote_in_req_ready[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].q_ready; - // Request selection: route to target tile's remote-in slot based on - // target tile ID, so that all requests to the same destination tile - // travel through one pipeline — preserving write-before-read ordering. - assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] * NumRemotePortCore - + remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] % NumRemotePortCore); + + tile_remote_out_req_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); - // Response selection: route back to source tile's remote-out slot. 
- // The originator (tile_id in user field) sent on slot - // (target_tile % NumRemotePortCore). The responding tile is `t` - // (genvar), so target_tile = t. - assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.tile_id * NumRemotePortCore - + t % NumRemotePortCore); + + tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); end end end - for (genvar t = 0; t < NumTiles; t ++) begin : gen_tiles + for (genvar t = 0; t < NumTilesPerGroup; t ++) begin : gen_tiles logic [9:0] hart_base_id; assign hart_base_id = hart_base_id_i + t * NumCoresTile; logic [TileIDWidth-1:0] tile_id; - assign tile_id = t; - - cachepool_tile #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NumCoresTile ), - .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .TileIDWidth ( TileIDWidth ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( 
RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_tile ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error[t] ), - // TODO: remove hardcode - .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), - .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), - .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), - .msip_i ( msip_i [t*NumCoresTile+:NumCoresTile] ), - .hart_base_id_i ( hart_base_id ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .tile_id_i ( tile_id ), - .private_start_addr_i ( private_start_addr_i ), - // AXI out for UART - .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), - .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), - // Remote Access Ports - .remote_req_o ( tile_remote_out_req[t] ), - .remote_req_dst_o ( remote_out_sel_tile[t] ), - .remote_rsp_i ( tile_remote_out_rsp[t] ), - .remote_rsp_ready_i ( tile_remote_out_ready[t] ), - .remote_req_i ( tile_remote_in_req [t] ), - .remote_rsp_o ( tile_remote_in_rsp [t] ), - .remote_rsp_ready_o ( tile_remote_in_ready[t] ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req_o[t*NumL1CtrlTile+:NumL1CtrlTile] ), - .cache_refill_rsp_i ( cache_refill_rsp_i[t*NumL1CtrlTile+:NumL1CtrlTile] ), - // BootROM / Core-side Cache Bypass - .axi_wide_req_o ( axi_wide_req_o [t*TileWideAxiPorts+:TileWideAxiPorts] ), - .axi_wide_rsp_i ( axi_wide_rsp_i [t*TileWideAxiPorts+:TileWideAxiPorts] ), - // Peripherals - .icache_events_o ( /* unused */ ), - .icache_prefetch_enable_i ( icache_prefetch_enable_i ), - .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), - .dynamic_offset_i ( dynamic_offset_i ), - .l1d_insn_i ( l1d_insn_i ), - .l1d_private_i ( l1d_private_i ), - .l1d_insn_valid_i 
( l1d_insn_valid_i ), - .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), - .l1d_busy_i ( l1d_busy_i [t] ) - ); + assign tile_id = tile_base_id_i + TileIDWidth'(t); + + if (NumRemoteGroupPortCore == 0) begin : gen_tile + cachepool_tile #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoresTile ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .TileIDWidth ( TileIDWidth ), + .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), + .NumTilesPerGroup ( NumTilesPerGroup ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_tile ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( error [t] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .tile_id_i ( tile_id ), + .private_start_addr_i 
( private_start_addr_i ), + // AXI out for UART + .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + // Remote Access Ports + .remote_req_o ( tile_remote_out_req [t] ), + .remote_req_dst_o ( remote_out_sel_tile [t] ), + .remote_rsp_i ( tile_remote_out_rsp [t] ), + .remote_rsp_ready_i ( tile_remote_out_ready[t] ), + .remote_req_i ( tile_remote_in_req [t] ), + .remote_rsp_o ( tile_remote_in_rsp [t] ), + .remote_rsp_ready_o ( tile_remote_in_ready [t] ), + // Inter-group Remote Access Ports (directly exposed to group I/O) + .remote_group_req_o ( ), + .remote_group_rsp_i ( '0 ), + .remote_group_req_i ( '0 ), + .remote_group_rsp_o ( ), + // Cache Refill Ports (now internal, connected to group-level xbar) + .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), + .l1d_busy_i ( l1d_busy_i [t] ) + ); + end else begin : gen_tile + cachepool_tile #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoresTile ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + 
.ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .TileIDWidth ( TileIDWidth ), + .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), + .NumTilesPerGroup ( NumTilesPerGroup ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_tile ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( error [t] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .tile_id_i ( tile_id ), + .private_start_addr_i ( private_start_addr_i ), + // AXI out for UART + .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + // Remote Access Ports + .remote_req_o ( tile_remote_out_req [t] ), + .remote_req_dst_o ( remote_out_sel_tile [t] ), + .remote_rsp_i ( tile_remote_out_rsp [t] ), + .remote_rsp_ready_i ( tile_remote_out_ready[t] ), + .remote_req_i ( tile_remote_in_req [t] ), + .remote_rsp_o ( tile_remote_in_rsp [t] ), + .remote_rsp_ready_o ( tile_remote_in_ready [t] ), + // Inter-group 
Remote Access Ports (directly exposed to group I/O) + .remote_group_req_o ( remote_group_req_o [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_rsp_i ( remote_group_rsp_i [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_req_i ( remote_group_req_i [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_rsp_o ( remote_group_rsp_o [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + // Cache Refill Ports (now internal, connected to group-level xbar) + .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), + .l1d_busy_i ( l1d_busy_i [t] ) + ); + end end // ------------ @@ -379,11 +881,10 @@ module cachepool_group // ------------ for (genvar p = 0; p < NrTCDMPortsPerCore; p++) begin : gen_remote_tile_xbar - // Decide which tile to go reqrsp_xbar #( - .NumInp (NumTiles * NumRemotePortCore ), - .NumOut (NumTiles * NumRemotePortCore ), + .NumInp (NumTilesPerGroup * NumRemotePortCore ), + .NumOut (NumTilesPerGroup * NumRemotePortCore ), .PipeReg (1'b1 ), .RspReg (1'b1 ), .ExtReqPrio (1'b0 ), diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv new file mode 100644 index 0000000..85ea868 --- /dev/null +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -0,0 +1,596 @@ +// Copyright 2025 ETH Zurich 
and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Description: Wrapper around cachepool_group that handles inter-group +// interconnection: master-side concentration xbar, flit packing, floo_router +// instances (req + rsp), and a slave-side dispatch xbar. +// +// Author: Diyou Shen + + +module cachepool_group_noc_wrapper + import cachepool_pkg::*; + import floo_pkg::*; + import spatz_pkg::*; + import fpnew_pkg::fpu_implementation_t; + import snitch_pma_pkg::snitch_pma_t; + import snitch_icache_pkg::icache_l1_events_t; + #( + parameter int unsigned AxiAddrWidth = 48, + parameter int unsigned AxiDataWidth = 512, + parameter int unsigned AxiIdWidthIn = 2, + parameter int unsigned AxiIdWidthOut = 2, + parameter int unsigned AxiUserWidth = 1, + parameter logic [31:0] BootAddr = 32'h0, + parameter logic [31:0] UartAddr = 32'h0, + parameter int unsigned NrCores = 0, + parameter int unsigned TCDMDepth = 1024, + parameter int unsigned ClusterPeriphSize = 64, + parameter int unsigned NrBanks = 2 * NrCores, + parameter int unsigned ICacheLineWidth = 0, + parameter int unsigned ICacheLineCount = 0, + parameter int unsigned ICacheSets = 0, + parameter fpu_implementation_t FPUImplementation = fpu_implementation_t'(0), + parameter int unsigned NumSpatzFPUs = 1, + parameter int unsigned NumSpatzIPUs = 1, + parameter snitch_pma_t SnitchPMACfg = '0, + parameter int unsigned NumIntOutstandingLoads = 1, + parameter int unsigned NumIntOutstandingMem = 4, + parameter int unsigned NumSpatzOutstandingLoads = 4, + parameter bit RegisterOffloadRsp = 1, + parameter bit RegisterCoreReq = 0, + parameter bit RegisterCoreRsp = 0, + parameter bit RegisterTCDMCuts = 1'b0, + parameter bit RegisterExt = 1'b0, + parameter axi_pkg::xbar_latency_e XbarLatency = axi_pkg::CUT_ALL_PORTS, + parameter int unsigned MaxMstTrans = 4, + parameter int unsigned MaxSlvTrans = 4, + parameter type axi_in_req_t = logic, + 
parameter type axi_in_resp_t = logic, + parameter type axi_narrow_req_t = logic, + parameter type axi_narrow_resp_t = logic, + parameter type axi_out_req_t = logic, + parameter type axi_out_resp_t = logic, + parameter type impl_in_t = logic, + parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, + parameter int unsigned NrSramCfg = 1 + ) ( + input logic clk_i, + input logic rst_ni, + input logic debug_req_i, + input logic meip_i, + input logic mtip_i, + input logic msip_i, + input logic [9:0] hart_base_id_i, + input logic [TileIDWidth-1:0] tile_base_id_i, + input axi_addr_t cluster_base_addr_i, + input axi_addr_t private_start_addr_i, + output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, + input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, + output icache_l1_events_t [NrCores-1:0] icache_events_o, + input logic icache_prefetch_enable_i, + input logic [NrCores-1:0] cl_interrupt_i, + input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, + input logic [3:0] l1d_private_i, + input cache_insn_t l1d_insn_i, + input logic l1d_insn_valid_i, + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, + input impl_in_t [NrSramCfg-1:0] impl_i, + output logic error_o, + // XY coordinates of this group in the inter-group mesh + input group_xy_id_t group_xy_id_i, + // Inter-group req mesh: 4 directions (N=0,E=1,S=2,W=3) + // dim1: direction, dim2: tile*NumNoCPortsPerTile+channel + output noc_group_req_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_o, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_valid_o, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_ready_i, + input noc_group_req_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_i, + input logic 
[3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_valid_i, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_ready_o, + // Inter-group rsp mesh + output noc_group_rsp_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_o, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_valid_o, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_ready_i, + input noc_group_rsp_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_i, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_valid_i, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_ready_o + ); + + + // ------------------------------------------------------------------------- + // Localparams + // ------------------------------------------------------------------------- + localparam int unsigned NumRemoteGroupPortTile = (NumRemoteGroupPortCore == 0) ? 1 + : NumRemoteGroupPortCore * NrTCDMPortsPerCore; + localparam int unsigned NumRemoteGroupPortGroup = NumRemoteGroupPortTile * NumTilesPerGroup; + localparam int unsigned NumNoCPortsGroup = NumNoCPortsPerTile * NumTilesPerGroup; + localparam int unsigned SlvXbarSelW = (NumRemoteGroupPortGroup > 1) ? $clog2(NumRemoteGroupPortGroup) : 1; + localparam int unsigned MstXbarSelW = (NumNoCPortsGroup > 1) ? $clog2(NumNoCPortsGroup) : 1; + + // -- Struct / xbar field widths (always >= 1 to avoid zero-width ports) ------ + localparam int unsigned NocCacheBankBits = $clog2(NrBanks); + localparam int unsigned NocAddrTileWidth = (NumTilesPerGroup > 1) ? $clog2(NumTilesPerGroup) : 1; + // -- Actual bit counts inside dst_tile_id (can be 0 when that dimension = 1) - + // dst_tile_id layout: [ group_y (NocGroupBitsY) | group_x (NocGroupBitsX) | local_tile (NocGroupOffset) ] + // where NocGroupOffset = $clog2(NumTilesPerGroup) (0 when NumTilesPerGroup == 1). + localparam int unsigned NocGroupOffset = $clog2(NumTilesPerGroup); + localparam int unsigned NocGroupBitsX = (NumGroupsX > 1) ? 
$clog2(NumGroupsX) : 0; + localparam int unsigned NocGroupBitsY = (NumGroupsY > 1) ? $clog2(NumGroupsY) : 0; + + + // ------------------------------------------------------------------------- + // Group ↔ wrapper boundary signals + // ------------------------------------------------------------------------- + remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_to_group; + remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_from_group; + remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_from_group; + remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_to_group; + + + // ------------------------------------------------------------------------- + // Mesh signals [tile][ch][dir=3:0] and transposition to/from ports + // ------------------------------------------------------------------------- + noc_group_req_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out_ready; + noc_group_req_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in_ready; + + noc_group_rsp_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out_ready; + noc_group_rsp_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in_ready; + + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mesh_trans_t + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_mesh_trans_n + for (genvar d = 0; 
d < 4; d++) begin : gen_mesh_trans_d + // Mute the channel when not valid for debugging + assign noc_req_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d] ? req_mesh_out[t][n][d] : '0; + assign noc_req_valid_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d]; + assign req_mesh_out_ready[t][n][d] = noc_req_ready_i[d][t*NumNoCPortsPerTile+n]; + assign req_mesh_in[t][n][d] = noc_req_i[d][t*NumNoCPortsPerTile+n]; + assign req_mesh_in_valid[t][n][d] = noc_req_valid_i[d][t*NumNoCPortsPerTile+n]; + assign noc_req_ready_o[d][t*NumNoCPortsPerTile+n] = req_mesh_in_ready[t][n][d]; + + assign noc_rsp_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d] ? rsp_mesh_out[t][n][d] : '0; + assign noc_rsp_valid_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d]; + assign rsp_mesh_out_ready[t][n][d] = noc_rsp_ready_i[d][t*NumNoCPortsPerTile+n]; + assign rsp_mesh_in[t][n][d] = noc_rsp_i[d][t*NumNoCPortsPerTile+n]; + assign rsp_mesh_in_valid[t][n][d] = noc_rsp_valid_i[d][t*NumNoCPortsPerTile+n]; + assign noc_rsp_ready_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_in_ready[t][n][d]; + end + end + end + + + if (NumRemoteGroupPortCore > 0) begin : gen_noc + + // ----------------------------------------------------------------------- + // Router inject/eject signals (flat 1D index noc_port = t*NumNoCPortsPerTile+n) + // ----------------------------------------------------------------------- + noc_group_req_t [NumNoCPortsGroup-1:0] packed_req; + logic [NumNoCPortsGroup-1:0] packed_req_valid; + logic [NumNoCPortsGroup-1:0] packed_req_ready; + + noc_group_req_t [NumNoCPortsGroup-1:0] eject_req; + logic [NumNoCPortsGroup-1:0] eject_req_valid; + logic [NumNoCPortsGroup-1:0] eject_req_ready; + + noc_group_rsp_t [NumNoCPortsGroup-1:0] inject_rsp; + logic [NumNoCPortsGroup-1:0] inject_rsp_valid; + logic [NumNoCPortsGroup-1:0] inject_rsp_ready; + + noc_group_rsp_t [NumNoCPortsGroup-1:0] eject_rsp; + logic [NumNoCPortsGroup-1:0] eject_rsp_valid; + logic [NumNoCPortsGroup-1:0] 
eject_rsp_ready; + + // Master xbar output (one concentrated req/rsp channel per tile/channel) + remote_group_req_chan_t [NumNoCPortsGroup-1:0] mst_xbar_req; + logic [NumNoCPortsGroup-1:0] mst_xbar_req_valid; + logic [NumNoCPortsGroup-1:0] mst_xbar_req_ready; + + // Slave xbar signals + noc_group_req_t [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req_valid; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req_ready; + noc_group_rsp_t [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp_valid; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp_ready; + noc_group_rsp_t [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp; + logic [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp_valid; + logic [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp_ready; + + logic [NumNoCPortsGroup-1:0][SlvXbarSelW-1:0] slv_xbar_slv_sel; + logic [NumRemoteGroupPortGroup-1:0][MstXbarSelW-1:0] slv_xbar_mst_sel; + + + // ----------------------------------------------------------------------- + // Master-side per-tile concentration xbar + flit packing + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mst_t + + remote_group_req_chan_t [NumRemoteGroupPortTile-1:0] mst_slv_req; + logic [NumRemoteGroupPortTile-1:0] mst_slv_req_valid; + logic [NumRemoteGroupPortTile-1:0] mst_slv_req_ready; + remote_group_rsp_chan_t [NumRemoteGroupPortTile-1:0] mst_slv_rsp; + logic [NumRemoteGroupPortTile-1:0] mst_slv_rsp_valid; + logic [NumRemoteGroupPortTile-1:0] mst_slv_rsp_ready; + remote_group_rsp_chan_t [NumNoCPortsPerTile-1:0] eject_rsp_payload; + portid_t [NumNoCPortsPerTile-1:0] mst_xbar_mst_sel; + portid_t [NumNoCPortsPerTile-1:0] mst_xbar_slv_selected; + + for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_mst_port_p + assign mst_slv_req[p] = remote_group_req_from_group[t*NumRemoteGroupPortTile+p].q; + assign mst_slv_req_valid[p] 
= remote_group_req_from_group[t*NumRemoteGroupPortTile+p].q_valid; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].q_ready = mst_slv_req_ready[p]; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].p = mst_slv_rsp[p]; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].p_valid = mst_slv_rsp_valid[p]; + assign mst_slv_rsp_ready[p] = + remote_group_req_from_group[t*NumRemoteGroupPortTile+p].p_ready; + end + + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_mst_eject_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + assign eject_rsp_payload[n] = eject_rsp[noc_port].payload; + assign mst_xbar_mst_sel[n] = eject_rsp[noc_port].hdr.src_port_id; + end + + // Static port-to-NoC-channel mapping: each flat port p has xbar index + // j = p % NrTCDMPortsPerCore, and is steered to NoC channel j % NumNoCPortsPerTile. + // Spatz ports (j=0..NrTCDMPortsPerCore-2) divide evenly across channels; + // Snitch (j=NrTCDMPortsPerCore-1) maps by the same modulo. + localparam int unsigned NocMstSelWidth = (NumNoCPortsPerTile > 1) + ? 
$clog2(NumNoCPortsPerTile) : 1; + logic [NumRemoteGroupPortTile-1:0][NocMstSelWidth-1:0] noc_mst_sel; + for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_noc_mst_sel + assign noc_mst_sel[p] = NocMstSelWidth'((p % NrTCDMPortsPerCore) % NumNoCPortsPerTile); + end + + reqrsp_xbar #( + .NumInp ( NumRemoteGroupPortTile ), + .NumOut ( NumNoCPortsPerTile ), + .tcdm_req_chan_t ( remote_group_req_chan_t ), + .tcdm_rsp_chan_t ( remote_group_rsp_chan_t ) + ) i_noc_mst_xbar ( + .clk_i, + .rst_ni, + .slv_req_i ( mst_slv_req ), + .slv_rr_i ( '0 ), + .slv_req_valid_i ( mst_slv_req_valid ), + .slv_req_ready_o ( mst_slv_req_ready ), + .slv_rsp_o ( mst_slv_rsp ), + .slv_rsp_valid_o ( mst_slv_rsp_valid ), + .slv_rsp_ready_i ( mst_slv_rsp_ready ), + .slv_sel_i ( noc_mst_sel ), + .slv_selected_o ( mst_xbar_slv_selected ), + .mst_req_o ( mst_xbar_req[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_req_valid_o ( mst_xbar_req_valid[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_req_ready_i ( mst_xbar_req_ready[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_rsp_i ( eject_rsp_payload ), + .mst_rr_i ( '0 ), + .mst_rsp_valid_i ( eject_rsp_valid[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_rsp_ready_o ( eject_rsp_ready[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_sel_i ( mst_xbar_mst_sel ) + ); + + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_pack_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + assign packed_req[noc_port].hdr.collective_op = '0; + assign packed_req[noc_port].hdr.src_id = group_xy_id_i; + // dst_tile_id set by tcdm_cache_interco: bits [NocGroupOffset +: NocGroupBitsX] = group_x, + // bits [(NocGroupOffset+NocGroupBitsX) +: NocGroupBitsY] = group_y. + // When a dimension has only 1 group, no bits are consumed and the coordinate is 0. 
+ if (NumGroupsX > 1) begin : gen_dst_x + assign packed_req[noc_port].hdr.dst_id.x = + mst_xbar_req[noc_port].user.dst_tile_id[NocGroupOffset +: NocGroupBitsX]; + end else begin : gen_dst_x + assign packed_req[noc_port].hdr.dst_id.x = '0; + end + if (NumGroupsY > 1) begin : gen_dst_y + assign packed_req[noc_port].hdr.dst_id.y = + mst_xbar_req[noc_port].user.dst_tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + end else begin : gen_dst_y + assign packed_req[noc_port].hdr.dst_id.y = '0; + end + assign packed_req[noc_port].hdr.dst_id.port_id = '0; + assign packed_req[noc_port].hdr.src_tile_id = group_tile_sel_t'(t); + assign packed_req[noc_port].hdr.src_port_id = mst_xbar_slv_selected[n]; + assign packed_req[noc_port].hdr.last = 1'b1; + assign packed_req[noc_port].payload = mst_xbar_req[noc_port]; + assign packed_req_valid[noc_port] = mst_xbar_req_valid[noc_port]; + assign mst_xbar_req_ready[noc_port] = packed_req_ready[noc_port]; + + end + + end : gen_mst_t + + + // ----------------------------------------------------------------------- + // Per-tile per-channel req floo_router + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_req_router_t + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_req_router_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + floo_router #( + .NumRoutes ( 5 ), + .NumVirtChannels ( 1 ), + .NumPhysChannels ( 1 ), + .InFifoDepth ( 2 ), + .OutFifoDepth ( 0 ), + .RouteAlgo ( XYRouting ), + .IdWidth ( $bits(group_xy_id_t) ), + .id_t ( group_xy_id_t ), + .NumAddrRules ( 1 ), + .addr_rule_t ( logic ), + .flit_t ( noc_group_req_t ), + .hdr_t ( noc_group_hdr_t ) + ) i_req_router ( + .clk_i, + .rst_ni, + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {packed_req_valid[noc_port], + req_mesh_in_valid[t][n][3:0]} ), + .ready_o ( {packed_req_ready[noc_port], + 
req_mesh_in_ready[t][n][3:0]} ), + .data_i ( {packed_req[noc_port], + req_mesh_in[t][n][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_req_valid[noc_port], + req_mesh_out_valid[t][n][3:0]} ), + .ready_i ( {eject_req_ready[noc_port], + req_mesh_out_ready[t][n][3:0]} ), + .data_o ( {eject_req[noc_port], + req_mesh_out[t][n][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) + ); + end + end + + + // ----------------------------------------------------------------------- + // Per-tile per-channel rsp floo_router + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_rsp_router_t + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_rsp_router_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + floo_router #( + .NumRoutes ( 5 ), + .NumVirtChannels ( 1 ), + .NumPhysChannels ( 1 ), + .InFifoDepth ( 2 ), + .OutFifoDepth ( 0 ), + .RouteAlgo ( XYRouting ), + .IdWidth ( $bits(group_xy_id_t) ), + .id_t ( group_xy_id_t ), + .NumAddrRules ( 1 ), + .addr_rule_t ( logic ), + .flit_t ( noc_group_rsp_t ), + .hdr_t ( noc_group_hdr_t ) + ) i_rsp_router ( + .clk_i, + .rst_ni, + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {inject_rsp_valid[noc_port], + rsp_mesh_in_valid[t][n][3:0]} ), + .ready_o ( {inject_rsp_ready[noc_port], + rsp_mesh_in_ready[t][n][3:0]} ), + .data_i ( {inject_rsp[noc_port], + rsp_mesh_in[t][n][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_rsp_valid[noc_port], + rsp_mesh_out_valid[t][n][3:0]} ), + .ready_i ( {eject_rsp_ready[noc_port], + rsp_mesh_out_ready[t][n][3:0]} ), + .data_o ( {eject_rsp[noc_port], + rsp_mesh_out[t][n][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) + ); + end + end + + + // ----------------------------------------------------------------------- + // Slave xbar selection signals + inject_rsp ↔ slv_xbar_slv_rsp + // 
----------------------------------------------------------------------- + for (genvar noc_port = 0; noc_port < NumNoCPortsGroup; noc_port++) begin : gen_slv_sel + assign slv_xbar_slv_sel[noc_port] = (NumTilesPerGroup == 1) + ? SlvXbarSelW'(eject_req[noc_port].hdr.src_port_id) + : SlvXbarSelW'(eject_req[noc_port].payload.addr[(dynamic_offset_i + NocCacheBankBits) +: NocAddrTileWidth] + * NumRemoteGroupPortTile + + eject_req[noc_port].hdr.src_port_id); + + end + + assign inject_rsp = slv_xbar_slv_rsp; + assign inject_rsp_valid = slv_xbar_slv_rsp_valid; + assign slv_xbar_slv_rsp_ready = inject_rsp_ready; + + + // ----------------------------------------------------------------------- + // Slave-side group-wide dispatch xbar + // ----------------------------------------------------------------------- + reqrsp_xbar #( + .NumInp ( NumNoCPortsGroup ), + .NumOut ( NumRemoteGroupPortGroup), + .tcdm_req_chan_t ( noc_group_req_t ), + .tcdm_rsp_chan_t ( noc_group_rsp_t ) + ) i_noc_slv_xbar ( + .clk_i, + .rst_ni, + .slv_req_i ( eject_req ), + .slv_rr_i ( '0 ), + .slv_req_valid_i ( eject_req_valid ), + .slv_req_ready_o ( eject_req_ready ), + .slv_rsp_o ( slv_xbar_slv_rsp ), + .slv_rsp_valid_o ( slv_xbar_slv_rsp_valid ), + .slv_rsp_ready_i ( slv_xbar_slv_rsp_ready ), + .slv_sel_i ( slv_xbar_slv_sel ), + .slv_selected_o ( ), + .mst_req_o ( slv_xbar_mst_req ), + .mst_req_valid_o ( slv_xbar_mst_req_valid ), + .mst_req_ready_i ( slv_xbar_mst_req_ready ), + .mst_rsp_i ( slv_xbar_mst_rsp ), + .mst_rr_i ( '0 ), + .mst_rsp_valid_i ( slv_xbar_mst_rsp_valid ), + .mst_rsp_ready_o ( slv_xbar_mst_rsp_ready ), + .mst_sel_i ( slv_xbar_mst_sel ) + ); + + + // ----------------------------------------------------------------------- + // Slave delivery: unpack xbar output → group slave ports + rsp packing + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_slv_deliver_t + for (genvar p = 0; p < 
NumRemoteGroupPortTile; p++) begin : gen_slv_deliver_p + localparam int unsigned port = t * NumRemoteGroupPortTile + p; + + // Placeholder response routing: route response back via the NoC channel + // of the same tile (t). Correct cross-tile response routing is deferred. + assign slv_xbar_mst_sel[port] = MstXbarSelW'(t * NumNoCPortsPerTile); + + always_comb begin : proc_req_unpack + remote_group_req_to_group[port].q = slv_xbar_mst_req[port].payload; + remote_group_req_to_group[port].q.user.src_group_x = + slv_xbar_mst_req[port].hdr.src_id.x; + remote_group_req_to_group[port].q.user.src_group_y = + slv_xbar_mst_req[port].hdr.src_id.y; + end + + assign remote_group_req_to_group[port].q_valid = slv_xbar_mst_req_valid[port]; + assign slv_xbar_mst_req_ready[port] = + remote_group_rsp_from_group[port].q_ready; + assign remote_group_req_to_group[port].p_ready = slv_xbar_mst_rsp_ready[port]; + + + assign slv_xbar_mst_rsp[port].payload = + remote_group_rsp_from_group[port].p; + assign slv_xbar_mst_rsp[port].hdr.collective_op = '0; + assign slv_xbar_mst_rsp[port].hdr.src_id = group_xy_id_i; + if (NumGroupsX > 1) begin : gen_rsp_dst_x + assign slv_xbar_mst_rsp[port].hdr.dst_id.x = + remote_group_rsp_from_group[port].p.user.tile_id[NocGroupOffset +: NocGroupBitsX]; + end else begin : gen_rsp_dst_x + assign slv_xbar_mst_rsp[port].hdr.dst_id.x = '0; + end + if (NumGroupsY > 1) begin : gen_rsp_dst_y + assign slv_xbar_mst_rsp[port].hdr.dst_id.y = + remote_group_rsp_from_group[port].p.user.tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + end else begin : gen_rsp_dst_y + assign slv_xbar_mst_rsp[port].hdr.dst_id.y = '0; + end + assign slv_xbar_mst_rsp[port].hdr.dst_id.port_id = '0; + assign slv_xbar_mst_rsp[port].hdr.src_tile_id = group_tile_sel_t'(t); + assign slv_xbar_mst_rsp[port].hdr.src_port_id = remote_group_rsp_from_group[port].p.user.port_id; + assign slv_xbar_mst_rsp[port].hdr.last = 1'b1; + assign slv_xbar_mst_rsp_valid[port] = + 
remote_group_rsp_from_group[port].p_valid; + end + end + + + end else begin : gen_noc_disabled + + assign remote_group_req_to_group = '0; + assign remote_group_rsp_to_group = '0; + assign req_mesh_out = '0; + assign req_mesh_out_valid = '0; + assign req_mesh_in_ready = '0; + assign rsp_mesh_out = '0; + assign rsp_mesh_out_valid = '0; + assign rsp_mesh_in_ready = '0; + + end + + + // ------------------------------------------------------------------------- + // Group instantiation + // ------------------------------------------------------------------------- + cachepool_group #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( AxiIdWidthOut ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NrCores ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_out_req_t ), + .axi_out_resp_t ( axi_out_resp_t ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_group ( + .clk_i, + .rst_ni, + .impl_i ( impl_i ), + .error_o ( error_o ), + .debug_req_i ( debug_req_i ), + .meip_i ( 
meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id_i ), + .tile_base_id_i ( tile_base_id_i ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .private_start_addr_i ( private_start_addr_i ), + .axi_narrow_req_o ( axi_narrow_req_o ), + .axi_narrow_rsp_i ( axi_narrow_rsp_i ), + .l2_req_o ( l2_req_o ), + .l2_rsp_i ( l2_rsp_i ), + .remote_group_req_o ( remote_group_req_from_group ), + .remote_group_rsp_i ( remote_group_rsp_to_group ), + .remote_group_req_i ( remote_group_req_to_group ), + .remote_group_rsp_o ( remote_group_rsp_from_group ), + .icache_events_o ( icache_events_o ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o ), + .l1d_busy_i ( l1d_busy_i ) + ); + +endmodule diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index 737bc70..a926a2a 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -46,25 +46,45 @@ package cachepool_pkg; localparam int unsigned NumIntOutstandingMem = `ifdef SNITCH_MAX_TRANS `SNITCH_MAX_TRANS `else 0 `endif; localparam int unsigned NumSpatzOutstandingLoads = `ifdef SPATZ_MAX_TRANS `SPATZ_MAX_TRANS `else 0 `endif; - localparam int unsigned NumAxiMaxTrans = 32; + localparam int unsigned NumAxiMaxTrans = 64; /////////////////// // TILE CONFIG // /////////////////// // How many cores for each tile? - localparam int unsigned NumCoresTile = NumCores / NumTiles; + localparam int unsigned NumCoresTile = NumCores / NumTiles; // How many remote ports for each tile per core's port? 
- localparam int unsigned NumRemotePortCore = `ifdef REMOTE_PORT_PER_CORE `REMOTE_PORT_PER_CORE `else 0 `endif; + localparam int unsigned NumRemotePortCore = `ifdef REMOTE_PORT_PER_CORE `REMOTE_PORT_PER_CORE `else 0 `endif; // How many cores within a tile? This is used to select the ports within a tile. - localparam int unsigned LogNumCoresTile = $clog2(NumCoresTile); + localparam int unsigned LogNumCoresTile = $clog2(NumCoresTile); // 4 ports from Spatz + 1 shared port from Snitch/FPU - localparam int unsigned NrTCDMPortsPerCore = 5; + localparam int unsigned NrTCDMPortsPerCore = 5; // How many remote ports for each tile in total? - localparam int unsigned NumRemotePortTile = NumRemotePortCore * NrTCDMPortsPerCore; + localparam int unsigned NumRemotePortTile = NumRemotePortCore * NrTCDMPortsPerCore; + + //////////////////// + // GROUP CONFIG // + //////////////////// + // How many tiles for each group? + localparam int unsigned NumTilesPerGroup = NumTiles / NumGroups; + + // How many cores for each group? + localparam int unsigned NumCoreGroup = NumCores / NumGroups; + + // How many remote group ports for each tile per core's port? + localparam int unsigned NumRemoteGroupPortCore = `ifdef RG_PORT_PER_CORE `RG_PORT_PER_CORE `else 0 `endif; + + // Number of inter-group NoC router channels per tile (x in the 5-to-x concentration xbar). + localparam int unsigned NumNoCPortsPerTile = `ifdef NOC_PORT_PER_TILE `NOC_PORT_PER_TILE `else 1 `endif; + + // Group mesh dimensions. NumGroupsY is derived; NumGroupsX must be set via config.
+ localparam int unsigned NumGroupsX = `ifdef NUM_GROUPS_X `NUM_GROUPS_X `else 1 `endif; + localparam int unsigned NumGroupsY = NumGroups / NumGroupsX; + //////////////////// // CLUSTER HW // @@ -76,6 +96,12 @@ package cachepool_pkg; localparam int unsigned ICacheLineCount = 128; localparam int unsigned ICacheSets = 4; + // Group-level L2 ICache (shared read-only cache, primarily for coalescing) + localparam int unsigned L2ICacheLineWidth = 512; + localparam int unsigned L2ICacheSets = 4; + localparam int unsigned L2ICacheSizeByte = 65536; + localparam int unsigned L2ICacheLineCount = L2ICacheSizeByte / (L2ICacheSets * L2ICacheLineWidth / 8); + // Be careful on unsigned long int passed in from configuration. // Currently use fixed values. localparam int unsigned TCDMStartAddr = 32'hBFFF_F800; @@ -172,26 +198,63 @@ package cachepool_pkg; localparam int unsigned ClusterRouteIdWidth = $clog2(NumClusterMst); /***** ID Width Topology (Tile -> Group -> Cluster) *****/ + // TileAxiIdWidth: base iCache/DMA AXI ID bits per tile before tile-index bits are added. + // Determines how many outstanding refills the iCache can track (2^TileAxiIdWidth = 8). + // This is the "tile_local_bits" field described above. localparam int unsigned TileAxiIdWidth = 3; localparam int unsigned GroupAxiIdWidth = TileAxiIdWidth + $clog2(NumTiles); localparam int unsigned ClusterAxiIdWidth = GroupAxiIdWidth + ClusterRouteIdWidth; - - // legacy naming + // Alias used by the Spatz-generated wrapper and testbench templates. localparam int unsigned SpatzAxiIdInWidth = ClusterAxiIdWidth; - // localparam int unsigned SpatzAxiIdInWidth = TileAxiIdWidth; - localparam int unsigned SpatzAxiIdOutWidth = ClusterAxiIdWidth + 1; + + // Per-group AXI output ID width (pre multi-group mux). + // The +1 comes from reqrsp_to_axi, which tags each burst with one extra bit. 
+ localparam int unsigned GroupAxiIdOutWidth = ClusterAxiIdWidth + 1; + // Bounded per-group refill ID width: uses NumTilesPerGroup (not NumTiles) so the + // ID space stays fixed regardless of total system size. axi_id_remap at each group + // output reduces GroupAxiIdOutWidth to this before the inter-group mux / future NoC. + // For NumGroups == 1, NumTilesPerGroup == NumTiles so this equals GroupAxiIdOutWidth. + localparam int unsigned WideRefillIdWidth = TileAxiIdWidth + $clog2(NumTilesPerGroup) + ClusterRouteIdWidth + 1; + // Cluster-level AXI output ID width: widened by multi-group mux. + // When NumGroups == 1, $clog2(1) == 0 so this equals WideRefillIdWidth == GroupAxiIdOutWidth. + localparam int unsigned GroupMuxIdBits = (NumGroups > 1) ? $clog2(NumGroups) : 0; + localparam int unsigned SpatzAxiIdOutWidth = WideRefillIdWidth + GroupMuxIdBits; // Fixed AXI ID width for IWC localparam int unsigned IwcAxiIdOutWidth = SpatzAxiIdOutWidth + 1; - localparam int unsigned CsrAxiMstIdWidth = ClusterAxiIdWidth; - localparam int unsigned CsrAxiSlvIdWidth = ClusterAxiIdWidth + $clog2(NumTiles+1); + // Cluster wrapper external output AXI ID width, after the wrapper-level axi_id_remap. + // Reduces the fat SpatzAxiIdOutWidth presented to the DRAM controller. + // Must satisfy: WrapperAxiIdOutWidth >= $clog2(NumAxiMaxTrans) = $clog2(64) = 6. + localparam int unsigned WrapperAxiIdOutWidth = 6; + // External SoC/testbench input AXI ID width (host → cluster direction). + // axi_id_remap in the wrapper expands these to SpatzAxiIdInWidth internally. + localparam int unsigned WrapperAxiIdInWidth = 4; + // External narrow output AXI ID width for the UART port (cluster → SoC direction). + // axi_id_remap in the wrapper compresses SpatzAxiUartIdWidth to this.
+ localparam int unsigned WrapperAxiNarrowIdOutWidth = 4; - // Base ID width 6, plus tile mux => adding clog(tile) - localparam int unsigned SpatzAxiNarrowIdWidth = 6 + $clog2(NumTiles); - // UART ID width, with an extra xbar + localparam int unsigned CsrAxiMstIdWidth = ClusterAxiIdWidth; + // ID width after per-master serialization before the CSR mux. + // axi_id_serialize at each CSR master reduces CsrAxiMstIdWidth to this, + // keeping the mux output (CsrAxiSlvIdWidth) bounded regardless of NumTiles. + // Must be > 1: axi_id_serialize internally uses axi_id_prepend which requires + // AxiMstPortIdWidth > MuxIdWidth (= 1 when AxiMstPortMaxUniqIds = 1). + localparam int unsigned CsrSerIdWidth = 2; + localparam int unsigned CsrAxiSlvIdWidth = CsrSerIdWidth + $clog2(NumTiles+1); + + // Narrow AXI ID width = ClusterAxiIdWidth (same field structure, used on the narrow path). + localparam int unsigned SpatzAxiNarrowIdWidth = ClusterAxiIdWidth; + // UART ID width: narrow path muxed across all tiles adds $clog2(NumTiles) bits. localparam int unsigned SpatzAxiUartIdWidth = SpatzAxiNarrowIdWidth + $clog2(NumTiles); + // BootROM AXI ID width: wide data bus, muxed from NumTilesPerGroup tile ports per group. + // The group's axi_mst_cache slave ID width = GroupAxiIdWidth + 1 + // (cluster passes WideIdWidthIn = SpatzAxiIdOutWidth - ClusterRouteIdWidth - GroupMuxIdBits + // = GroupAxiIdWidth + 1). + // The per-group BootROM mux master adds $clog2(NumTilesPerGroup) bits on top. 
+ localparam int unsigned BootRomAxiSlvIdWidth = GroupAxiIdWidth + 1 + $clog2(NumTilesPerGroup); + /***** Tile Ports *****/ // We have three sets of AXI ports for each tile: // 1) Wide output bus for BootRom & L2 (from ICache) @@ -228,8 +291,6 @@ package cachepool_pkg; // Wide AXI ports: X to DRAM (X=4 for now) localparam int unsigned ClusterWideOutAxiPorts = NumL2Channel; - // TODO: multi-tile support - // One more from the Snitch core ////////////////// // L2 / DRAM // @@ -238,7 +299,7 @@ package cachepool_pkg; localparam int unsigned L2BankWidth = `ifdef L2_BANK_WIDTH `L2_BANK_WIDTH `else 0 `endif; localparam int unsigned L2BankBeWidth = L2BankWidth / 8; - parameter DramType = "DDR4"; // "DDR4", "DDR3", "HBM2", "LPDDR4" + parameter DramType = "HBM2"; // "DDR4", "DDR3", "HBM2", "LPDDR4" parameter int unsigned DramBase = 32'h8000_0000; // One more for UART? @@ -274,6 +335,7 @@ package cachepool_pkg; typedef logic [SpatzAxiIdInWidth-1:0] axi_id_in_t; typedef logic [SpatzAxiIdOutWidth-1:0] axi_id_out_t; + typedef logic [GroupAxiIdOutWidth-1:0] axi_id_group_out_t; typedef logic [SpatzAxiNarrowIdWidth-1:0] axi_narrow_id_t; // legacy name; TODO: remove @@ -282,9 +344,15 @@ package cachepool_pkg; typedef logic [SpatzAxiUartIdWidth-1:0] axi_uart_id_t; typedef logic [CsrAxiMstIdWidth-1:0] axi_id_csr_mst_t; + typedef logic [CsrSerIdWidth-1:0] axi_id_csr_ser_t; typedef logic [CsrAxiSlvIdWidth-1:0] axi_id_csr_slv_t; - typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + typedef logic [WrapperAxiIdOutWidth-1:0] axi_id_wrapper_out_t; + typedef logic [WrapperAxiIdInWidth-1:0] axi_id_wrapper_in_t; + typedef logic [WrapperAxiNarrowIdOutWidth-1:0] axi_id_wrapper_narrow_out_t; + + typedef logic [BootRomAxiSlvIdWidth-1:0] axi_bootrom_slv_id_t; ////////////////// // TILE TYPES // @@ -362,7 +430,64 @@ package cachepool_pkg; // GROUP TYPES // /////////////////// - typedef logic [RemoteXbarSelWidth-1:0] remote_xbar_sel_t; + 
typedef logic [RemoteXbarSelWidth-1:0] remote_xbar_sel_t; + typedef logic [$clog2(NrTCDMPortsPerCore)-1:0] portid_t; + + typedef struct packed { + logic [CoreIDWidth-1:0] core_id; + logic [TileIDWidth-1:0] tile_id; + reqid_t req_id; + logic is_fpu; + portid_t port_id; + logic [idx_width(NumGroupsX)-1:0] src_group_x; + logic [idx_width(NumGroupsY)-1:0] src_group_y; + // Globally-unique destination tile ID, set by tcdm_cache_interco for + // inter-group requests. Upper bits (above $clog2(NumTilesPerGroup)) are + // the linear group index; lower bits are the local tile within the group. + logic [TileIDWidth-1:0] dst_tile_id; + } remote_group_user_t; + + `REQRSP_TYPEDEF_ALL(remote_group, narrow_addr_t, narrow_data_t, narrow_strb_t, remote_group_user_t) + + // XY mesh coordinates for a group. port_id selects the eject port (always 0 for single-link). + typedef struct packed { + logic [idx_width(NumGroupsX)-1:0] x; + logic [idx_width(NumGroupsY)-1:0] y; + logic port_id; + } group_xy_id_t; + + // Per-group tile index used by dispatch xbar selection. + typedef logic [idx_width(NumTilesPerGroup)-1:0] group_tile_sel_t; + + // Routing header embedded in every inter-group NoC flit. + typedef struct packed { + logic [3:0] collective_op; + group_xy_id_t src_id; + group_xy_id_t dst_id; + group_tile_sel_t src_tile_id; + portid_t src_port_id; + logic last; + } noc_group_hdr_t; + + // Inter-group NoC flit types (payload + routing header). 
+ typedef struct packed { + remote_group_req_chan_t payload; + noc_group_hdr_t hdr; + } noc_group_req_t; + + typedef struct packed { + remote_group_rsp_chan_t payload; + noc_group_hdr_t hdr; + } noc_group_rsp_t; + + // Group ICache (L2 read-only cache control) + localparam int unsigned ROCacheNumAddrRules = 1; + typedef struct packed { + logic enable; + logic flush_valid; + axi_addr_t [ROCacheNumAddrRules-1:0] start_addr; + axi_addr_t [ROCacheNumAddrRules-1:0] end_addr; + } ro_cache_ctrl_t; ///////////////////// @@ -421,12 +546,24 @@ package cachepool_pkg; // AXI typedef bundles `AXI_TYPEDEF_ALL(spatz_axi_narrow, axi_addr_t, axi_narrow_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(spatz_axi_in, axi_addr_t, axi_id_in_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) - - `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(axi_csr_mst, axi_addr_t, axi_id_csr_mst_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(axi_csr_slv, axi_addr_t, axi_id_csr_slv_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Per-group AXI output: narrower ID (pre multi-group mux). + `AXI_TYPEDEF_ALL(spatz_axi_group_out, axi_addr_t, axi_id_group_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Wrapper-level external output type: ID narrowed from SpatzAxiIdOutWidth to WrapperAxiIdOutWidth. 
+ `AXI_TYPEDEF_ALL(spatz_axi_wrapper_out, axi_addr_t, axi_id_wrapper_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Wrapper-level external input type: narrow ID from SoC (WrapperAxiIdInWidth → SpatzAxiIdInWidth inside). + `AXI_TYPEDEF_ALL(spatz_axi_wrapper_in, axi_addr_t, axi_id_wrapper_in_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // Wrapper-level external narrow output type: ID compressed from SpatzAxiUartIdWidth to WrapperAxiNarrowIdOutWidth. + `AXI_TYPEDEF_ALL(spatz_axi_wrapper_narrow_out, axi_addr_t, axi_id_wrapper_narrow_out_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + + `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(axi_csr_mst, axi_addr_t, axi_id_csr_mst_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // Serialized CSR type: CsrSerIdWidth-bit ID output of axi_id_serialize, fed into the CSR mux slave ports. + `AXI_TYPEDEF_ALL(axi_csr_ser, axi_addr_t, axi_id_csr_ser_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(axi_csr_slv, axi_addr_t, axi_id_csr_slv_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // BootROM: wide data bus (same payload as cache), slv = post-mux (widened ID) + `AXI_TYPEDEF_ALL(axi_bootrom_slv, axi_addr_t, axi_bootrom_slv_id_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) /************************************************************** * FUNCTIONS diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index e01c0ac..0fa53fb 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -4,19 +4,11 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" `include "common_cells/assertions.svh" `include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" -`include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" `include
"reqrsp_interface/typedef.svh" `include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" /// Tile implementation for CachePool module cachepool_tile @@ -24,7 +16,7 @@ module cachepool_tile import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; - import snitch_icache_pkg::icache_events_t; + import snitch_icache_pkg::icache_l1_events_t; #( /// Width of physical address. parameter int unsigned AxiAddrWidth = 48, @@ -48,10 +40,6 @@ module cachepool_tile parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. - parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -66,10 +54,14 @@ module cachepool_tile /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// Tile ID Width parameter int unsigned TileIDWidth = 0, + /// Number of dedicated inter-group remote ports per xbar plane. + /// When 0, no inter-group ports are generated (single-group mode). + parameter int unsigned NumRemoteGroupPortCore = 0, + /// Number of tiles within a single group (passed to interco for + /// group-id extraction from the address). 
+ parameter int unsigned NumTilesPerGroup = 0, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = '0, @@ -110,67 +102,76 @@ module cachepool_tile parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` and `L1NumTagBank` is changed ***/ - parameter int unsigned NrSramCfg = 1 + parameter int unsigned NrSramCfg = 1, + localparam int unsigned TotRGPorts = (NumRemoteGroupPortCore == 0) ? 0 : NumRemoteGroupPortCore*NrTCDMPortsPerCore-1 ) ( /// System clock. - input logic clk_i, + input logic clk_i, /// Asynchronous active high reset. This signal is assumed to be _async_. - input logic rst_ni, + input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, - /// End of Computing indicator to notify the host/tb - // output logic eoc_o, + input logic debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. 
- input logic [9:0] hart_base_id_i, + input logic [9:0] hart_base_id_i, /// Base address of cluster. TCDM and cluster peripheral location are derived from /// it. This signal is pseudo-static. - input axi_addr_t cluster_base_addr_i, + input axi_addr_t cluster_base_addr_i, /// Tile ID, internal ID, the base is always 0, in theory should not change during use - input remote_tile_sel_t tile_id_i, + input remote_tile_sel_t tile_id_i, /// Partitioning address - input axi_addr_t private_start_addr_i, + input axi_addr_t private_start_addr_i, /// AXI Narrow out-port (UART/Peripheral) - output axi_narrow_req_t [1:0] axi_out_req_o, - input axi_narrow_resp_t [1:0] axi_out_resp_i, + output axi_narrow_req_t [1:0] axi_out_req_o, + input axi_narrow_resp_t [1:0] axi_out_resp_i, /// Cache Refill ports - output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, - input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, + output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, + input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, /// Wide AXI ports to cluster level - output axi_out_req_t [TileNarrowAxiPorts-1:0] axi_wide_req_o, - input axi_out_resp_t [TileNarrowAxiPorts-1:0] axi_wide_rsp_i, + output axi_out_req_t [TileNarrowAxiPorts-1:0] axi_wide_req_o, + input axi_out_resp_t [TileNarrowAxiPorts-1:0] axi_wide_rsp_i, /// Remote Tile access ports (to remote tiles) - output tcdm_req_t [NumRemotePortTile-1:0] remote_req_o, - output remote_tile_sel_t [NumRemotePortTile-1:0] remote_req_dst_o, - input tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_i, - input logic [NumRemotePortTile-1:0] remote_rsp_ready_i, + output tcdm_req_t [NumRemotePortTile-1:0] remote_req_o, + output remote_tile_sel_t [NumRemotePortTile-1:0] remote_req_dst_o, + input tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_i, + input logic [NumRemotePortTile-1:0] remote_rsp_ready_i, /// Remote Tile access ports (from remote tiles) - input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, - output 
tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_o, - output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, + input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, + output tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_o, + output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, + /// Inter-group remote access ports (to other groups). + /// Flat layout: flat index = j + r * NrTCDMPortsPerCore, + /// where j is the interco instance and r is the inter-group remote slot. + /// Total count: NumRemoteGroupPortCore * NrTCDMPortsPerCore. + /// Uses REQRSP-style types with built-in ready and remote_group_user_t. + output remote_group_req_t [TotRGPorts:0] remote_group_req_o, + input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, + /// Inter-group remote access ports (from other groups) + input remote_group_req_t [TotRGPorts:0] remote_group_req_i, + output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// Peripheral signals - output icache_events_t [NrCores-1:0] icache_events_o, - input logic icache_prefetch_enable_i, - input logic [NrCores-1:0] cl_interrupt_i, - input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, - input cache_insn_t l1d_insn_i, - input logic [3:0] l1d_private_i, - input logic l1d_insn_valid_i, - output logic l1d_insn_ready_o, - input logic l1d_busy_i, + output icache_l1_events_t [NrCores-1:0] icache_events_o, + input logic icache_prefetch_enable_i, + input logic [NrCores-1:0] cl_interrupt_i, + input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, + input cache_insn_t l1d_insn_i, + input logic [3:0] l1d_private_i, + input logic l1d_insn_valid_i, + output logic l1d_insn_ready_o, + input logic l1d_busy_i, @@ -189,7 +190,6 @@ module cachepool_tile // --------- // TODO: Should be imported from Memory-mapped Reg logic [2:0] num_private_cache; - // half-half assign num_private_cache = l1d_private_i[2:0]; /// Minimum width to hold the core number. 
@@ -304,11 +304,6 @@ module cachepool_tile `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, tcdm_user_t) - `MEM_TYPEDEF_ALL(mem, tcdm_mem_addr_t, data_t, strb_t, tcdm_user_t) - - `REG_BUS_TYPEDEF_ALL(reg, addr_t, data_t, strb_t) - - typedef struct packed { int unsigned idx; addr_t start_addr; @@ -405,7 +400,7 @@ module cachepool_tile core_events_t [NrCores-1:0] core_events; - snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; + // snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; // 4. Memory Subsystem (Core side). reqrsp_req_t [NrCores-1:0] core_req, filtered_core_req; @@ -418,6 +413,12 @@ module cachepool_tile tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_req, cache_xbar_req; tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_rsp, cache_xbar_rsp; + // Post-xbar gated copies. + // cache_ctrl_req : xbar output with q_valid suppressed during flush. + // cache_bank_rsp : raw response from the bank/AMO stage; q_ready is gated before + // being returned to the interco as cache_xbar_rsp. 
+ tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_ctrl_req; + tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_bank_rsp; tcdm_req_t [NumL1CtrlTile-1:0] cache_amo_req; tcdm_rsp_t [NumL1CtrlTile-1:0] cache_amo_rsp; @@ -517,16 +518,14 @@ module cachepool_tile always_comb begin : cache_flush_protection for (int j = 0; unsigned'(j) < NrTCDMPortsCores; j++) begin /***** REQ *****/ - // Wire to Cache outputs unmerge_req[j].q = tcdm_req[j].q; - // invalidate the request when cache is busy - unmerge_req[j].q_valid = tcdm_req[j].q_valid && !l1d_busy_i; + unmerge_req[j].q_valid = tcdm_req[j].q_valid; unmerge_pready[j] = 1'b1; /***** RSP *****/ tcdm_rsp[j].p = unmerge_rsp[j].p; tcdm_rsp[j].p_valid = unmerge_rsp[j].p_valid; - tcdm_rsp[j].q_ready = unmerge_rsp[j].q_ready && !l1d_busy_i; + tcdm_rsp[j].q_ready = unmerge_rsp[j].q_ready; end end @@ -545,25 +544,21 @@ module cachepool_tile // Used to determine the mapping policy between different cache banks. // Set through CSR - logic [$clog2(TCDMAddrWidth)-1:0] dynamic_offset; + logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset; assign dynamic_offset = dynamic_offset_i; // One entry per flat remote port: flat index = j + r*NrTCDMPortsPerCore // where j is the xbar index and r is the remote slot within that xbar. logic [NumRemotePortTile-1:0] remote_out_pready, remote_in_pready; - // Flush protection for remote ports. - // - // During a flush (l1d_busy_i) remote tiles must be fully stalled: - // - q_valid gated : stops new requests being presented to the xbar - // - q_ready gated : stops the xbar accepting a request that is already - // sitting at the input (spill register would otherwise - // pop it, and the transaction would be lost because the - // cache is unavailable) - // - remote_in_pready gated : stops response-ready from propagating back, - // preventing in-flight completions during the flush window + // Intra-group remote port wiring. 
+ // q_valid and q_ready for incoming requests are passed through without gating: + // the after-xbar flush gate (cache_xbar_flush_gate) provides the authoritative + // protection at the cache bank boundary and naturally back-pressures through + // the interco to the remote sender. + // response-ready (remote_in_pready) is still gated to prevent draining in-flight + // completions during the flush window. tcdm_req_t [NumRemotePortTile-1:0] remote_req_gated; - // Intermediate response signals from the xbar before q_ready gating. tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_xbar; always_comb begin : remote_flush_protection @@ -571,14 +566,10 @@ module cachepool_tile for (int r = 0; r < NumRemotePortCore; r++) begin automatic int unsigned flat = j + r * NrTCDMPortsPerCore; - // Gate q_valid: prevent new requests entering the xbar. remote_req_gated[flat].q = remote_req_i[flat].q; - remote_req_gated[flat].q_valid = remote_req_i[flat].q_valid && !l1d_busy_i; + remote_req_gated[flat].q_valid = remote_req_i[flat].q_valid; - // Pass the full xbar response through, then gate only q_ready so the - // remote tile cannot complete a handshake during a flush. remote_rsp_o[flat] = remote_rsp_xbar[flat]; - remote_rsp_o[flat].q_ready = remote_rsp_xbar[flat].q_ready && !l1d_busy_i; // Gate response-ready back to us: prevent draining completions // of requests that arrived just before the flush. @@ -589,9 +580,149 @@ module cachepool_tile assign remote_rsp_ready_o = remote_out_pready; + // ------------------------------------------------------------------------- + // Inter-group remote ports – type conversion and flush protection + // ------------------------------------------------------------------------- + // External ports use REQRSP-style remote_group_req_t / remote_group_rsp_t + // (with built-in ready and remote_group_user_t). + // Internal interco uses TCDM-style tcdm_req_t / tcdm_rsp_t. + // This section bridges the two and applies flush gating. 
+ // + // Same flat layout as remote ports: flat = j + r * NrTCDMPortsPerCore. + // Total count: NumRemoteGroupPortCore * NrTCDMPortsPerCore. + + localparam int unsigned NumRemoteGroupPortTile = NumRemoteGroupPortCore * NrTCDMPortsPerCore; + + // Internal TCDM-style signals going to/from the interco. + tcdm_req_t [NumRemoteGroupPortTile-1:0] rg_interco_in_req; // incoming requests to interco + tcdm_rsp_t [NumRemoteGroupPortTile-1:0] rg_interco_in_rsp; // responses from interco (for incoming) + logic [NumRemoteGroupPortTile-1:0] rg_interco_in_pready; // response ready for incoming + + tcdm_req_t [NumRemoteGroupPortTile-1:0] rg_interco_out_req; // outgoing requests from interco + tcdm_rsp_t [NumRemoteGroupPortTile-1:0] rg_interco_out_rsp; // responses returning (for outgoing) + logic [NumRemoteGroupPortTile-1:0] rg_interco_out_pready;// response ready for outgoing + remote_tile_sel_t [NumRemoteGroupPortTile-1:0] rg_interco_out_dst; // target tile from interco + + if (NumRemoteGroupPortCore > 0) begin : gen_remote_group_ports + always_comb begin + for (int j = 0; j < NrTCDMPortsPerCore; j++) begin + for (int r = 0; r < NumRemoteGroupPortCore; r++) begin + automatic int unsigned flat = j + r * NrTCDMPortsPerCore; + + // ----------------------------------------------------------- + // Incoming: REQRSP → TCDM conversion → interco + // q_valid and q_ready are passed through without gating; the + // after-xbar flush gate (cache_xbar_flush_gate) is the authoritative + // protection point and naturally back-pressures through the interco. 
+ // ----------------------------------------------------------- + rg_interco_in_req[flat] = '{ + q: '{ + addr: remote_group_req_i[flat].q.addr, + write: remote_group_req_i[flat].q.write, + data: remote_group_req_i[flat].q.data, + strb: remote_group_req_i[flat].q.strb, + amo: remote_group_req_i[flat].q.amo, + user: '{ + core_id: remote_group_req_i[flat].q.user.core_id, + tile_id: remote_group_req_i[flat].q.user.tile_id, + req_id: remote_group_req_i[flat].q.user.req_id, + is_fpu: remote_group_req_i[flat].q.user.is_fpu, + default: '0 + }, + default: '0 + }, + q_valid: remote_group_req_i[flat].q_valid, + default: '0 + }; + + // Interco response (TCDM) → REQRSP for remote_group_rsp_o. + remote_group_rsp_o[flat] = '{ + p: '{ + data: rg_interco_in_rsp[flat].p.data, + write: rg_interco_in_rsp[flat].p.write, + user: '{ + core_id: rg_interco_in_rsp[flat].p.user.core_id, + tile_id: rg_interco_in_rsp[flat].p.user.tile_id, + req_id: rg_interco_in_rsp[flat].p.user.req_id, + is_fpu: rg_interco_in_rsp[flat].p.user.is_fpu, + port_id: portid_t'(j), + default: '0 + }, + default: '0 + }, + p_valid: rg_interco_in_rsp[flat].p_valid, + q_ready: rg_interco_in_rsp[flat].q_ready, + default: '0 + }; + + // Response ready from the external port (REQRSP p_ready). 
+ rg_interco_in_pready[flat] = remote_group_req_i[flat].p_ready && !l1d_busy_i; + + // ----------------------------------------------------------- + // Outgoing: interco → flush gating → TCDM to REQRSP → output + // ----------------------------------------------------------- + remote_group_req_o[flat] = '{ + q: '{ + addr: rg_interco_out_req[flat].q.addr, + write: rg_interco_out_req[flat].q.write, + data: rg_interco_out_req[flat].q.data, + strb: rg_interco_out_req[flat].q.strb, + amo: rg_interco_out_req[flat].q.amo, + user: '{ + core_id: rg_interco_out_req[flat].q.user.core_id, + tile_id: rg_interco_out_req[flat].q.user.tile_id, + req_id: rg_interco_out_req[flat].q.user.req_id, + is_fpu: rg_interco_out_req[flat].q.user.is_fpu, + port_id: portid_t'(j), + dst_tile_id: rg_interco_out_dst[flat], + default: '0 + }, + default: '0 + }, + q_valid: rg_interco_out_req[flat].q_valid && !l1d_busy_i, + p_ready: rg_interco_out_pready[flat] && !l1d_busy_i, + default: '0 + }; + + // Returning response (REQRSP) → TCDM for the interco. + rg_interco_out_rsp[flat] = '{ + p: '{ + data: remote_group_rsp_i[flat].p.data, + write: remote_group_rsp_i[flat].p.write, + user: '{ + core_id: remote_group_rsp_i[flat].p.user.core_id, + tile_id: remote_group_rsp_i[flat].p.user.tile_id, + req_id: remote_group_rsp_i[flat].p.user.req_id, + is_fpu: remote_group_rsp_i[flat].p.user.is_fpu, + default: '0 + }, + default: '0 + }, + p_valid: remote_group_rsp_i[flat].p_valid, + q_ready: remote_group_rsp_i[flat].q_ready, + default: '0 + }; + end + end + end + end else begin : gen_remote_group_no_ports + // No inter-group remote ports: tie off outputs. 
+ assign remote_group_rsp_o = '0; + assign remote_group_req_o = '0; + assign rg_interco_in_req = '0; + assign rg_interco_in_pready = '0; + assign rg_interco_out_rsp = '0; + assign rg_interco_out_pready = '0; + assign rg_interco_in_rsp = '0; + assign rg_interco_out_req = '0; + assign rg_interco_out_dst = '0; + end + /// Wire requests after strb handling to the cache controller. /// Each xbar j handles NumRemotePortCore remote slots at flat indices /// j + r*NrTCDMPortsPerCore for r in [0, NumRemotePortCore). + /// Similarly, each xbar j handles NumRemoteGroupPortCore inter-group remote slots at flat indices + /// j + r*NrTCDMPortsPerCore for r in [0, NumRemoteGroupPortCore). for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_xbar // Collect the NumRemotePortCore remote slots for this xbar. tcdm_req_t [NumRemotePortCore-1:0] xbar_remote_req_gated; @@ -613,33 +744,92 @@ module cachepool_tile assign remote_req_o [flat] = xbar_remote_req_o [r]; end - tcdm_cache_interco #( - .NumTiles (NumTiles ), - .NumCores (NrCores ), - .NumCache (NumL1CtrlTile ), - .NumTotCache (NumL1CacheCtrl ), - .NumRemotePort (NumRemotePortCore ), - .AddrWidth (TCDMAddrWidth ), - .TileIDWidth (TileIDWidth ), - .tcdm_req_t (tcdm_req_t ), - .tcdm_rsp_t (tcdm_rsp_t ), - .tcdm_req_chan_t (tcdm_req_chan_t ), - .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) - ) i_cache_xbar ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .tile_id_i ( tile_id_i ), - .dynamic_offset_i ( dynamic_offset ), - .private_start_addr_i ( private_start_addr_i ), - .num_private_cache_i ( num_private_cache ), - .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), - .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), - .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), - .tile_sel_o ( xbar_remote_req_dst ), - .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ), - .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ), - .mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} ) - ); + // Collect the 
NumRemoteGroupPortCore inter-group remote slots for this xbar (same flat layout). + // When NumRemoteGroupPortCore == 0, no inter-group remote signals exist and the interco is + // instantiated without inter-group remote ports (backward-compatible). + if (NumRemoteGroupPortCore > 0) begin : gen_remote_group_slice + tcdm_req_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_req; + tcdm_rsp_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_rsp; + logic [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_pready; + tcdm_req_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_req; + tcdm_rsp_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_rsp; + logic [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_pready; + remote_tile_sel_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_dst; + + for (genvar r = 0; r < NumRemoteGroupPortCore; r++) begin : gen_remote_group_slice_r + localparam int unsigned flat = j + r * NrTCDMPortsPerCore; + // Incoming: from conversion/flush → interco input + assign xbar_remote_group_in_req [r] = rg_interco_in_req [flat]; + assign xbar_remote_group_in_pready [r] = rg_interco_in_pready [flat]; + assign rg_interco_in_rsp [flat] = xbar_remote_group_in_rsp [r]; + // Outgoing: interco output → conversion/flush + assign rg_interco_out_req [flat] = xbar_remote_group_out_req [r]; + assign rg_interco_out_dst [flat] = xbar_remote_group_out_dst [r]; + assign xbar_remote_group_out_rsp [r] = rg_interco_out_rsp [flat]; + assign rg_interco_out_pready [flat] = xbar_remote_group_out_pready[r]; + end + + tcdm_cache_interco #( + .NumTiles (NumTiles ), + .NumCores (NrCores ), + .NumCache (NumL1CtrlTile ), + .NumTotCache (NumL1CacheCtrl ), + .NumRemotePort (NumRemotePortCore ), + .NumRemoteGroupPort (NumRemoteGroupPortCore ), + .NumTilesPerGroup (NumTilesPerGroup ), + .AddrWidth (TCDMAddrWidth ), + .TileIDWidth (TileIDWidth ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t 
) + ) i_cache_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i ({xbar_remote_group_in_req, xbar_remote_req_gated, cache_req [j]}), + .core_rsp_ready_i ({xbar_remote_group_in_pready, xbar_remote_in_pready, cache_pready [j]}), + .core_rsp_o ({xbar_remote_group_in_rsp, xbar_remote_rsp_xbar, cache_rsp [j]}), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( xbar_remote_group_out_dst ), + .mem_req_o ({xbar_remote_group_out_req, xbar_remote_req_o, cache_xbar_req [j]}), + .mem_rsp_ready_o ({xbar_remote_group_out_pready, xbar_remote_out_pready, cache_xbar_pready[j]}), + .mem_rsp_i ({xbar_remote_group_out_rsp, xbar_remote_rsp_i, cache_xbar_rsp [j]}) + ); + end else begin : gen_no_remote_group + // No inter-group remote ports: instantiate interco without inter-group remote ports (backward-compatible). + tcdm_cache_interco #( + .NumTiles (NumTiles ), + .NumCores (NrCores ), + .NumCache (NumL1CtrlTile ), + .NumTotCache (NumL1CacheCtrl ), + .NumRemotePort (NumRemotePortCore ), + .AddrWidth (TCDMAddrWidth ), + .TileIDWidth (TileIDWidth ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) + ) i_cache_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), + .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), + .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( ), + .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]}), + .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]}), + .mem_rsp_i ({xbar_remote_rsp_i, 
cache_xbar_rsp [j]}) + ); + end end for (genvar cb = 0; cb < NumL1CtrlTile; cb++) begin : gen_cache_connect @@ -659,9 +849,9 @@ module cachepool_tile ) i_cache_amo ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .core_req_i (cache_xbar_req [j][cb] ), + .core_req_i (cache_ctrl_req [j][cb] ), .core_rsp_ready_i (cache_xbar_pready[j][cb] ), - .core_rsp_o (cache_xbar_rsp [j][cb] ), + .core_rsp_o (cache_bank_rsp [j][cb] ), .mem_req_o (cache_amo_req [cb] ), .mem_rsp_ready_o (cache_amo_pready [cb] ), .mem_rsp_i (cache_amo_rsp [cb] ) @@ -713,22 +903,67 @@ module cachepool_tile assign cache_rsp_reg.p.write = cache_rsp_write[cb][j]; end else begin : gen_no_amo - // Bypass AMO and registers - assign cache_req_valid[cb][j] = cache_xbar_req [j][cb].q_valid; - assign cache_rsp_ready[cb][j] = cache_xbar_pready[j][cb]; - assign cache_req_addr [cb][j] = cache_xbar_req [j][cb].q.addr; - assign cache_req_meta [cb][j] = cache_xbar_req [j][cb].q.user; - assign cache_req_write[cb][j] = cache_xbar_req [j][cb].q.write; - assign cache_req_data [cb][j] = cache_xbar_req [j][cb].q.data; - assign cache_req_strb [cb][j] = cache_xbar_req [j][cb].q.strb; - - assign cache_xbar_rsp[j][cb].p_valid = cache_rsp_valid[cb][j]; - assign cache_xbar_rsp[j][cb].q_ready = cache_req_ready[cb][j]; - assign cache_xbar_rsp[j][cb].p.data = cache_rsp_data [cb][j]; - assign cache_xbar_rsp[j][cb].p.user = cache_rsp_meta [cb][j]; - - assign cache_xbar_rsp[j][cb].p.write = cache_rsp_write[cb][j]; + // Spill registers to cut the L1 xbar → coalescer critical path, + // matching the timing budget of the Snitch AMO path above. 
+ tcdm_req_t cache_req_reg; + tcdm_rsp_t cache_rsp_reg; + + spill_register #( + .T ( tcdm_req_chan_t ), + .Bypass ( 1'b0 ) + ) i_spill_reg_cache_req ( + .clk_i , + .rst_ni ( rst_ni ), + .valid_i ( cache_ctrl_req[j][cb].q_valid ), + .ready_o ( cache_bank_rsp[j][cb].q_ready ), + .data_i ( cache_ctrl_req[j][cb].q ), + .valid_o ( cache_req_reg.q_valid ), + .ready_i ( cache_rsp_reg.q_ready ), + .data_o ( cache_req_reg.q ) + ); + spill_register #( + .T ( tcdm_rsp_chan_t ), + .Bypass ( 1'b1 ) + ) i_spill_reg_cache_rsp ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .valid_i ( cache_rsp_reg.p_valid ), + .ready_o ( cache_rsp_ready[cb][j] ), + .data_i ( cache_rsp_reg.p ), + .valid_o ( cache_bank_rsp[j][cb].p_valid ), + .ready_i ( cache_xbar_pready[j][cb] ), + .data_o ( cache_bank_rsp[j][cb].p ) + ); + + assign cache_req_valid[cb][j] = cache_req_reg.q_valid; + assign cache_req_addr [cb][j] = cache_req_reg.q.addr; + assign cache_req_meta [cb][j] = cache_req_reg.q.user; + assign cache_req_write[cb][j] = cache_req_reg.q.write; + assign cache_req_data [cb][j] = cache_req_reg.q.data; + assign cache_req_strb [cb][j] = cache_req_reg.q.strb; + + assign cache_rsp_reg.p_valid = cache_rsp_valid[cb][j]; + assign cache_rsp_reg.q_ready = cache_req_ready[cb][j]; + assign cache_rsp_reg.p.data = cache_rsp_data [cb][j]; + assign cache_rsp_reg.p.user = cache_rsp_meta [cb][j]; + assign cache_rsp_reg.p.write = cache_rsp_write[cb][j]; + + end + end + end + + // Post-xbar flush gate (applied uniformly across all ports). + // Suppresses q_valid going into the bank so no new cache accesses are processed + // while a flush is in progress, and gates q_ready going back to the interco so the + // xbar cannot dequeue a buffered request that is already sitting at its output. 
+ always_comb begin : cache_xbar_flush_gate + for (int j = 0; j < NrTCDMPortsPerCore; j++) begin + for (int cb = 0; cb < NumL1CtrlTile; cb++) begin + cache_ctrl_req[j][cb] = cache_xbar_req[j][cb]; + cache_ctrl_req[j][cb].q_valid = cache_xbar_req[j][cb].q_valid && !l1d_busy_i; + cache_xbar_rsp[j][cb] = cache_bank_rsp[j][cb]; + cache_xbar_rsp[j][cb].q_ready = cache_bank_rsp[j][cb].q_ready && !l1d_busy_i; end end end @@ -745,6 +980,7 @@ module cachepool_tile localparam NumWordPerLine = L1LineWidth / DataWidth; localparam int unsigned WordBytes = DataWidth / 8; +`ifndef TARGET_SYNTHESIS initial begin $display("Cache Configuration:"); $display(" NumCtrl : %0d", NumL1CtrlTile); @@ -759,6 +995,7 @@ module cachepool_tile $display(" RefillDataWidth: %0d", RefillDataWidth); $display(" DynamicOffset : %0d", dynamic_offset); end +`endif // CL-offset mask: bits below dynamic_offset, verbatim in both directions. logic [SpatzAxiAddrWidth-1:0] bitmask_lo; @@ -885,6 +1122,7 @@ module cachepool_tile .CacheLineWidth (L1LineWidth ), .SetAssociativity (L1AssoPerCtrl ), .BankFactor (L1BankFactor ), + // .LogDebug (0 ), .RefillDataWidth (RefillDataWidth ), // Type .core_meta_t (tcdm_user_t ), @@ -1056,7 +1294,7 @@ module cachepool_tile .clk_i (clk_i ), .rst_ni (rst_ni ), .impl_i ('0 ), - .impl_o (/* unsed */ ), + .impl_o (/* unused */ ), .req_i (l1_tag_bank_req [cb][j]), .we_i (l1_tag_bank_we [cb][j]), .addr_i (l1_tag_bank_addr [cb][j]), @@ -1087,7 +1325,7 @@ module cachepool_tile .clk_i (clk_i ), .rst_ni (rst_ni ), .impl_i ('0 ), - .impl_o (/* unsed */ ), + .impl_o (/* unused */ ), .req_i ( l1_data_bank_req [cb][BaseIdx] ), .we_i ( l1_data_bank_we [cb][BaseIdx] ), .addr_i ( l1_data_bank_addr [cb][BaseIdx] ), @@ -1111,7 +1349,7 @@ module cachepool_tile // .clk_i (clk_i ), // .rst_ni (rst_ni ), // .impl_i ('0 ), - // .impl_o (/* unsed */ ), + // .impl_o (/* unused */ ), // .req_i (l1_data_bank_req [cb][j]), // .we_i (l1_data_bank_we [cb][j]), // .addr_i (l1_data_bank_addr [cb][j]), @@ 
-1134,13 +1372,13 @@ module cachepool_tile interrupts_t irq; sync #(.STAGES (2)) - i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i[i]), .serial_o (irq.debug)); + i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i), .serial_o (irq.debug)); sync #(.STAGES (2)) - i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i[i]), .serial_o (irq.meip)); + i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i), .serial_o (irq.meip)); sync #(.STAGES (2)) - i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i[i]), .serial_o (irq.mtip)); + i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i), .serial_o (irq.mtip)); sync #(.STAGES (2)) - i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i[i]), .serial_o (irq.msip)); + i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i), .serial_o (irq.msip)); assign irq.mcip = cl_interrupt_i[i]; tcdm_req_t [TcdmPorts-1:0] tcdm_req_wo_user; @@ -1155,15 +1393,10 @@ module cachepool_tile .RVF (RVF ), .RVD (RVD ), .RVV (RVV ), - .Xdma (Xdma[i] ), .AddrWidth (AxiAddrWidth ), .DataWidth (NarrowDataWidth ), .UserWidth (AxiUserWidth ), - .DMADataWidth (AxiDataWidth ), - .DMAIdWidth (AxiIdWidthIn ), .SnitchPMACfg (SnitchPMACfg ), - .DMAAxiReqFifoDepth (DMAAxiReqFifoDepth ), - .DMAReqFifoDepth (DMAReqFifoDepth ), .dreq_t (reqrsp_req_t ), .drsp_t (reqrsp_rsp_t ), .dreq_chan_t (reqrsp_req_chan_t ), @@ -1255,14 +1488,14 @@ module cachepool_tile .L0_LINE_COUNT ( 8 ), .LINE_WIDTH ( ICacheLineWidth ), .LINE_COUNT ( ICacheLineCount ), - .SET_COUNT ( ICacheSets ), + .WAY_COUNT ( ICacheSets ), .FETCH_AW ( AxiAddrWidth ), .FETCH_DW ( 32 ), .FILL_AW ( AxiAddrWidth ), .FILL_DW ( AxiDataWidth ), .EARLY_LATCH ( 0 ), .L0_EARLY_TAG_WIDTH ( snitch_pkg::PAGE_SHIFT - $clog2(ICacheLineWidth/8) ), - .ISO_CROSSING ( 1'b0 ), + .ISO_CROSSING ( 1'b1 ), .axi_req_t ( axi_mst_tile_wide_req_t ), .axi_rsp_t ( axi_mst_tile_wide_resp_t ), .sram_cfg_data_t ( impl_in_t ), @@ -1272,7 +1505,9 @@ module cachepool_tile .clk_d2_i ( clk_i ), .rst_ni ( rst_ni ), .enable_prefetching_i ( icache_prefetch_enable_i ), - 
.icache_events_o ( icache_events_o ), + .enable_branch_pred_i ( '0 ), + .icache_l0_events_o ( ), + .icache_l1_events_o ( ), .flush_valid_i ( flush_valid ), .flush_ready_o ( flush_ready ), .inst_addr_i ( inst_addr ), @@ -1283,6 +1518,8 @@ module cachepool_tile .inst_error_o ( inst_error ), .sram_cfg_tag_i ( '0 ), .sram_cfg_data_i ( '0 ), + .sram_cfg_out_data_o (), + .sram_cfg_out_tag_o (), .axi_req_o ( wide_axi_mst_req[ICache] ), .axi_rsp_i ( wide_axi_mst_rsp[ICache] ) ); @@ -1471,12 +1708,7 @@ module cachepool_tile // ------------- // Sanity Checks // ------------- - // Sanity check the parameters. Not every configuration makes sense. - `ASSERT_INIT(CheckSuperBankSanity, NrBanks >= BanksPerSuperBank); - `ASSERT_INIT(CheckSuperBankFactor, (NrBanks % BanksPerSuperBank) == 0); // Check that the cluster base address aligns to the TCDMSize. `ASSERT(ClusterBaseAddrAlign, ((TCDMSize - 1) & cluster_base_addr_i) == 0) - // Make sure we only have one DMA in the system. - `ASSERT_INIT(NumberDMA, $onehot0(Xdma)) endmodule diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index ba49e11..5287208 100644 --- a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -21,24 +21,57 @@ // private_bank = addr_bank_bits % num_private_cache_q // shared_bank = num_private_cache_q + (addr_bank_bits % num_shared_cache_q) // For non-power-of-2 partition sizes this causes uneven bank utilisation. +// +// Multi-group support (NumRemoteGroupPort > 0): +// +// When the cluster contains multiple groups, tile IDs are globally unique +// and encode both the group and tile-within-group: +// tile_id = {group_id, local_tile_id} +// +// The xbar performs three-way routing for shared (non-private) requests: +// 1. Local : same tile -> local cache bank +// 2. Intra-group : same group, diff tile -> remote port (existing xbar) +// 3. 
Inter-group : different group -> inter-group remote port (new) +// +// inter-group remote ports are appended after the remote ports on both input and output +// sides of the xbar, preserving full backward compatibility when +// NumRemoteGroupPort == 0. `include "common_cells/registers.svh" +`include "common_cells/assertions.svh" module tcdm_cache_interco #( /// Number of Tiles ('>= 1') parameter int unsigned NumTiles = 32'd1, /// Number of inputs into the interconnect (Cores per Tile) (`> 0`). parameter int unsigned NumCores = 32'd0, - /// Number of remote ports added to xbar ('>= 0'). + /// Number of remote ports added to xbar for intra-group traffic ('>= 0'). parameter int unsigned NumRemotePort = 32'd0, + /// Number of dedicated inter-group remote ports ('>= 0'). + /// When 0, the module behaves identically to the single-group configuration. + /// Each inter-group remote port serves as both an output (requests to other groups) and an + /// input (requests arriving from other groups), mirroring NumRemotePort. + parameter int unsigned NumRemoteGroupPort = 32'd0, /// Number of outputs from the interconnect (Cache banks per Tile) (`> 0`). parameter int unsigned NumCache = 32'd0, /// Number of total cache banks across all tiles (used for address scramble). + /// For multi-group, this must cover all tiles across all groups. parameter int unsigned NumTotCache = 32'd0, /// Address width in bits (cacheline offset: 512b => 6 bits). parameter int unsigned AddrWidth = 32'd32, /// Tile ID width ('> 0'). + /// In multi-group configurations, TileIDWidth covers the globally unique + /// tile ID which encodes both group and tile-within-group: + /// tile_id = {group_id, local_tile_id} parameter int unsigned TileIDWidth = 32'd1, + /// DRAM base address, used to check if we get illegal access + parameter int unsigned DramBaseAddr = 32'h8000_0000, + /// Number of tiles within a single group. 
+ /// Used to extract the group portion from the address tile field: + /// group_id = addr_tile_bits / NumTilesPerGroup + /// Only relevant when NumRemoteGroupPort > 0. Defaults to NumTiles for + /// backward compatibility (single-group: all tiles are in one group). + parameter int unsigned NumTilesPerGroup = NumTiles, /// Port type of the data request ports. parameter type tcdm_req_t = logic, @@ -52,7 +85,9 @@ module tcdm_cache_interco #( parameter snitch_pkg::topo_e Topology = snitch_pkg::LogarithmicInterconnect, /// Dependency parameters – do not override. parameter type tile_id_t = logic [TileIDWidth-1:0], - parameter type addr_t = logic [AddrWidth-1:0] + parameter type addr_t = logic [AddrWidth-1:0], + localparam TotInPorts = NumCores+NumRemotePort+NumRemoteGroupPort, + localparam TotOutPorts = NumCache+NumRemotePort+NumRemoteGroupPort ) ( /// Clock, positive edge triggered. @@ -68,69 +103,79 @@ module tcdm_cache_interco #( input logic [$clog2(NumCache):0] num_private_cache_i, /// Partitioning address input addr_t private_start_addr_i, - /// Request port (cores + remote-in) ---------------------------------- - input tcdm_req_t [NumCores+NumRemotePort-1:0] core_req_i, + /// Request port (cores + intra-group remote-in + inter-group remote-in). + input tcdm_req_t [TotInPorts-1:0] core_req_i, /// Response ready in. - input logic [NumCores+NumRemotePort-1:0] core_rsp_ready_i, - /// Response port (cores + remote-in). - output tcdm_rsp_t [NumCores+NumRemotePort-1:0] core_rsp_o, + input logic [TotInPorts-1:0] core_rsp_ready_i, + /// Response port (cores + intra-group remote-in + inter-group remote-in). + output tcdm_rsp_t [TotInPorts-1:0] core_rsp_o, /// Memory side ------------------------------------------------------- - /// Which remote tile is targeted (one entry per remote output port). + /// Which remote tile is targeted (one entry per intra-group remote output). 
output tile_id_t [NumRemotePort-1:0] tile_sel_o, - // output logic remote_group_o, - /// Requests to cache banks and remote output ports. - output tcdm_req_t [NumCache+NumRemotePort-1:0] mem_req_o, + /// Which tile is targeted via inter-group remote (one entry per inter-group remote output). + /// Carries the full globally-unique tile ID; the wrapper decomposes it + /// into group XY coordinates for the router and local tile ID for the + /// receiving-side xbar. + output tile_id_t [NumRemoteGroupPort-1:0] remote_group_sel_o, + /// Requests to cache banks, intra-group remote, and inter-group remote ports. + output tcdm_req_t [TotOutPorts-1:0] mem_req_o, /// Response ready out. - output logic [NumCache+NumRemotePort-1:0] mem_rsp_ready_o, - /// Responses from cache banks and remote output ports. - input tcdm_rsp_t [NumCache+NumRemotePort-1:0] mem_rsp_i + output logic [TotOutPorts-1:0] mem_rsp_ready_o, + /// Responses from cache banks, intra-group remote, and inter-group remote ports. + input tcdm_rsp_t [TotOutPorts-1:0] mem_rsp_i ); // ------------------------------------------------------------------------- // Local parameters // ------------------------------------------------------------------------- - // Bits to index into xbar outputs (local banks + one remote slot). - localparam int unsigned NumOutSelBits = $clog2(NumCache + NumRemotePort); + // Total number of xbar input and output ports. + localparam int unsigned NumInp = NumCores + NumRemotePort + NumRemoteGroupPort; + localparam int unsigned NumOut = NumCache + NumRemotePort + NumRemoteGroupPort; + // Bits to index into xbar outputs. + localparam int unsigned NumOutSelBits = $clog2(NumOut); // Bits to index into xbar inputs. - localparam int unsigned NumInpSelBits = $clog2(NumCores + NumRemotePort); + localparam int unsigned NumInpSelBits = $clog2(NumInp); // Bits needed to select among local cache banks. 
- localparam int unsigned CacheBankBits = $clog2(NumCache); + localparam int unsigned CacheBankBits = $clog2(NumCache); // Bits needed to select the tile in the shared address space. - // Equals TileIDWidth by construction (NumTotCache / NumCache == NumTiles). - localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); + // Equals TileIDWidth by construction (NumTotCache / NumCache == NumTotalTiles). + localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); + + // Group extraction: number of bits to identify the group within TileID. + // LocalTileBits = $clog2(NumTilesPerGroup); GroupBits = TileBits - LocalTileBits. + // Only meaningful when NumRemoteGroupPort > 0. + localparam int unsigned LocalTileBits = $clog2(NumTilesPerGroup); // ------------------------------------------------------------------------- // Types // ------------------------------------------------------------------------- typedef logic [NumInpSelBits-1:0] mem_sel_t; - typedef logic [NumOutSelBits -1:0] core_sel_t; + typedef logic [NumOutSelBits-1:0] core_sel_t; // ------------------------------------------------------------------------- // Internal signals // ------------------------------------------------------------------------- // Xbar routing signals. - core_sel_t [NumCores+NumRemotePort-1:0] core_req_sel; - mem_sel_t [NumCache+NumRemotePort-1:0] mem_rsp_sel; - // '1' when this request stays on local banks. - logic [NumCores+NumRemotePort-1:0] local_sel; + core_sel_t [NumInp-1:0] core_req_sel; + mem_sel_t [NumOut-1:0] mem_rsp_sel; // '1' when a request targets the private partition. - logic [NumCores+NumRemotePort-1:0] is_private; + logic [NumInp-1:0] is_private; // Xbar channel signals. 
- tcdm_req_chan_t [NumCores+NumRemotePort-1:0] core_req; - logic [NumCores+NumRemotePort-1:0] core_req_valid, core_req_ready; + tcdm_req_chan_t [NumInp-1:0] core_req; + logic [NumInp-1:0] core_req_valid, core_req_ready; - tcdm_req_chan_t [NumCache+NumRemotePort-1:0] mem_req; - logic [NumCache+NumRemotePort-1:0] mem_req_valid, mem_req_ready; + tcdm_req_chan_t [NumOut-1:0] mem_req; + logic [NumOut-1:0] mem_req_valid, mem_req_ready; - tcdm_rsp_chan_t [NumCores+NumRemotePort-1:0] core_rsp; - logic [NumCores+NumRemotePort-1:0] core_rsp_valid, core_rsp_ready; + tcdm_rsp_chan_t [NumInp-1:0] core_rsp; + logic [NumInp-1:0] core_rsp_valid, core_rsp_ready; - tcdm_rsp_chan_t [NumCache+NumRemotePort-1:0] mem_rsp; - logic [NumCache+NumRemotePort-1:0] mem_rsp_valid, mem_rsp_ready; + tcdm_rsp_chan_t [NumOut-1:0] mem_rsp; + logic [NumOut-1:0] mem_rsp_valid, mem_rsp_ready; // ------------------------------------------------------------------------- // Partition control – registered to ease timing @@ -155,7 +200,7 @@ module tcdm_cache_interco #( // Private/shared classification (request side, before xbar) // ------------------------------------------------------------------------- - for (genvar inp = 0; inp < NumCores+NumRemotePort; inp++) begin : gen_is_private + for (genvar inp = 0; inp < NumInp; inp++) begin : gen_is_private assign is_private[inp] = (core_req[inp].addr >= private_start_addr_q); end @@ -164,103 +209,131 @@ module tcdm_cache_interco #( // ------------------------------------------------------------------------- reqrsp_xbar #( - .NumInp (NumCores + NumRemotePort), - .NumOut (NumCache + NumRemotePort), + .NumInp (NumInp ), + .NumOut (NumOut ), .PipeReg (1'b0 ), .ExtReqPrio (1'b0 ), .ExtRspPrio (1'b0 ), .tcdm_req_chan_t (tcdm_req_chan_t ), .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) ) i_cache_xbar ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (core_req ), - .slv_rr_i ('0 ), - .slv_req_valid_i (core_req_valid ), - .slv_req_ready_o (core_req_ready ), - .slv_rsp_o 
(core_rsp ), - .slv_rsp_valid_o (core_rsp_valid ), - .slv_rsp_ready_i (core_rsp_ready ), - .slv_sel_i (core_req_sel ), - .slv_selected_o (/* unused */ ), - .mst_req_o (mem_req ), - .mst_rr_i ('0 ), - .mst_req_valid_o (mem_req_valid ), - .mst_req_ready_i (mem_req_ready ), - .mst_rsp_i (mem_rsp ), - .mst_rsp_valid_i (mem_rsp_valid ), - .mst_rsp_ready_o (mem_rsp_ready ), - .mst_sel_i (mem_rsp_sel ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (core_req ), + .slv_rr_i ('0 ), + .slv_req_valid_i (core_req_valid ), + .slv_req_ready_o (core_req_ready ), + .slv_rsp_o (core_rsp ), + .slv_rsp_valid_o (core_rsp_valid ), + .slv_rsp_ready_i (core_rsp_ready ), + .slv_sel_i (core_req_sel ), + .slv_selected_o (/* unused */ ), + .mst_req_o (mem_req ), + .mst_rr_i ('0 ), + .mst_req_valid_o (mem_req_valid ), + .mst_req_ready_i (mem_req_ready ), + .mst_rsp_i (mem_rsp ), + .mst_rsp_valid_i (mem_rsp_valid ), + .mst_rsp_ready_o (mem_rsp_ready ), + .mst_sel_i (mem_rsp_sel ) ); // ------------------------------------------------------------------------- // Request routing (xbar input-side selection) // ------------------------------------------------------------------------- // - // Address layout (example: offset=6, CacheBankBits=2, TileBits=2): + // Address layout (example: offset=6, CacheBankBits=2, TileBits=4 with + // LocalTileBits=2 and GroupBits=2): // - // 31 14 | 13 12 | 11 10 | 9 7 | 5 0 - // Tag | TileID | BankSel | Index | CL offset - // ^-- [offset+CacheBankBits+TileBits-1 : offset+CacheBankBits] - // ^-- [offset+CacheBankBits-1 : offset] + // 31 16 | 15 14 | 13 12 | 11 10 | 9 7 | 5 0 + // Tag | GroupID | LclTID | BankSel | Index | CL offset + // ^-- [offset+CacheBankBits+TileBits-1 : offset+CacheBankBits+LocalTileBits] + // ^-- [offset+CacheBankBits+LocalTileBits-1 : offset+CacheBankBits] + // ^-- [offset+CacheBankBits-1 : offset] // - // Partitioning supports any num_private_cache_q in [0..NumCache]: - // Private banks : ports [0 .. 
num_private_cache_q-1] - // Shared banks : ports [num_private_cache_q .. NumCache-1] + // Three-way routing classification: + // 1. Local : addr tile == my tile -> route to cache bank + // 2. Intra-group : same group, different tile -> route to remote port + // 3. Inter-group : different group -> route to inter-group remote port // - // Bank selection uses modulo folding: - // private_bank = (addr_bank_bits % num_private_cache_q) - // shared_bank = num_private_cache_q + (addr_bank_bits % num_shared_cache_q) + // Partitioning (private/shared) interacts as follows: + // - Private requests are always local (same as before). + // - Shared requests use the full three-way classification. // - // For power-of-2 partition sizes this reduces to a simple bit mask. - // For non-power-of-2 sizes (e.g. 3) the modulo is a small comparator since - // addr_bank_bits is only CacheBankBits wide. + // The original two-way classification (local vs. remote) is preserved + // when NumRemoteGroupPort == 0, ensuring backward compatibility. + + // Derive this tile's group ID from the globally-unique tile_id_i. + logic [TileBits-1:0] my_group_id; + if (NumRemoteGroupPort == 0) begin + assign my_group_id = tile_id_i; + end else begin + assign my_group_id = tile_id_i[TileBits-1:LocalTileBits]; + end - for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_req_sel + for (genvar port = 0; port < NumInp; port++) begin : gen_req_sel logic [CacheBankBits-1:0] addr_bank; - logic [TileIDWidth-1:0] addr_tile; + // Full tile ID extracted from the address (covers group + local tile). + logic [TileBits-1:0] addr_tile_id; + // Group portion of the address tile field. + logic [TileBits-1:0] addr_group_id; + // Whether the addressed group matches this tile's group. + logic same_group; always_comb begin // Defaults. - local_sel[port] = 1'b1; core_req_sel[port] = '0; // Extract the raw BankSel field from the address. 
- addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; - // Extract the target TileID from the address (used for remote port selection). - addr_tile = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; - - if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) || NumTiles == 1) begin - // All-private or single-tile: every request is local. + addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; + // Extract the full tile ID (group + local) from the address. + addr_tile_id = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; + // Extract group portion (upper bits of tile ID). + addr_group_id = addr_tile_id >> LocalTileBits; + // Compare group IDs. + same_group = (addr_group_id == my_group_id); + + if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) + || (NumTiles == 1 && NumRemoteGroupPort == 0)) begin + // All-private, or single-tile single-group: every request is local. // Use the full BankSel field directly (no folding needed). - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank); end else if (num_private_cache_q == '0) begin - // All-shared: check TileID to decide local vs. remote. - // Use the full BankSel field directly (no folding needed). - local_sel[port] = (addr_tile == tile_id_i); - // Route remote requests by target tile ID so that all accesses to the - // same tile share a single pipeline, preserving write-before-read - // ordering across barriers. - core_req_sel[port] = local_sel[port] - ? core_sel_t'(addr_bank) - : core_sel_t'(NumCache + (addr_tile % NumRemotePort)); + // All-shared: full three-way classification. + if (NumRemoteGroupPort > 0 && !same_group) begin + // Inter-group: route to inter-group remote port. 
+ core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + + (port % NumRemoteGroupPort)); + end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] + && !(NumTiles == 1)) begin + // Intra-group remote: different tile, same group. + core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); + end else begin + // Local: same tile. + core_req_sel[port] = core_sel_t'(addr_bank); + end end else begin - // Mixed: fold addr_bank into the appropriate partition via modulo. + // Mixed partition: fold addr_bank into the appropriate partition. if (is_private[port]) begin // Private request: always local. - // bank = addr_bank % num_private_cache_q, offset from bank 0. - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank % num_private_cache_q); end else begin - // Shared request: check TileID to decide local vs. remote. - // bank = num_private_cache_q + (addr_bank % num_shared_cache_q). - local_sel[port] = (addr_tile == tile_id_i); - core_req_sel[port] = local_sel[port] - ? core_sel_t'(num_private_cache_q + (addr_bank % num_shared_cache_q)) - : core_sel_t'(NumCache + (addr_tile % NumRemotePort)); + // Shared request: three-way classification. + if (NumRemoteGroupPort > 0 && !same_group) begin + // Inter-group: route to inter-group remote port. + core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + + (port % NumRemoteGroupPort)); + end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] + && !(NumTiles == 1)) begin + // Intra-group remote: different tile, same group. + core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); + end else begin + // Local: same tile. 
+ core_req_sel[port] = core_sel_t'(num_private_cache_q + + (addr_bank % num_shared_cache_q)); + end end end end @@ -269,16 +342,35 @@ module tcdm_cache_interco #( // ------------------------------------------------------------------------- // Response routing (xbar output-side selection) // ------------------------------------------------------------------------- + // + // Responses from local cache banks are routed back to the originating + // core using core_id. Responses from intra-group remote tiles and + // inter-group remote ports carry a tile_id that differs from tile_id_i; + // these are forwarded to the corresponding remote-in or inter-group remote-in port. + + for (genvar port = 0; port < NumOut; port++) begin : gen_rsp_sel + logic [TileBits-1:0] rsp_group_id; + if (NumRemoteGroupPort == 0) begin + assign rsp_group_id = my_group_id; + end else begin + assign rsp_group_id = mem_rsp[port].user.tile_id[TileBits-1:LocalTileBits]; + end - for (genvar port = 0; port < NumCache+NumRemotePort; port++) begin : gen_rsp_sel always_comb begin mem_rsp_sel[port] = mem_rsp[port].user.core_id; if (mem_rsp[port].user.tile_id != tile_id_i) begin - // Response destined for a remote tile: forward to the remote interco - // port that matches the incoming request path. The group-level xbar - // routes requests from source tile S to our remote-in slot - // (S % NumRemotePort), so responses must return via the same slot. - mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.tile_id % NumRemotePort)); + // Response originates from a different tile (intra-group remote or + // inter-group remote). Determine which input port set it came from. + if (NumRemoteGroupPort > 0 + && rsp_group_id != my_group_id) begin + // Inter-group: forward to the inter-group remote-in input port. + mem_rsp_sel[port] = mem_sel_t'(NumCores + NumRemotePort + + (mem_rsp[port].user.core_id % NumRemoteGroupPort)); + end else begin + // Intra-group: forward to the remote-in input port. 
+ mem_rsp_sel[port] = mem_sel_t'(NumCores + + (mem_rsp[port].user.core_id % NumRemotePort)); + end end end end @@ -287,7 +379,7 @@ module tcdm_cache_interco #( // Input-side pipeline registers // ------------------------------------------------------------------------- - for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_cache_interco_reg + for (genvar port = 0; port < NumInp; port++) begin : gen_cache_interco_reg spill_register #( .T (tcdm_req_chan_t ) ) i_tcdm_req_reg ( @@ -349,11 +441,11 @@ module tcdm_cache_interco #( // // lower = addr & ((1 << offset) - 1) // CLoffset, verbatim // rot_field = (addr >> offset) & ((1 << N) - 1) // N routing bits - // upper = addr >> (offset + N) // Tag+Index + // upper = addr >> (offset + N) // Tag+Index // // addr_rot = lower - // | (upper << offset) // close the hole - // | (rot_field << (AddrWidth - N)) // park at MSB + // | (upper << offset) // close the hole + // | (rot_field << (AddrWidth - N)) // park at MSB // Width of bits_to_rotate signal: must hold values up to CacheBankBits+TileBits. localparam int unsigned RotWidth = $clog2(CacheBankBits + TileBits + 1) + 1; @@ -408,7 +500,7 @@ module tcdm_cache_interco #( // Output assignment // ------------------------------------------------------------------------- - for (genvar port = 0; port < NumCache + NumRemotePort; port++) begin : gen_cache_io + for (genvar port = 0; port < NumOut; port++) begin : gen_cache_io always_comb begin mem_req_o[port] = '{ q : mem_req[port], @@ -419,10 +511,14 @@ module tcdm_cache_interco #( if (port < NumCache) begin // Local bank: forward address with routing bits rotated to MSB. mem_req_o[port].q.addr = addr_rot[port]; - end else begin - // Remote port: pass address untouched; extract target tile ID. + end else if (port < NumCache + NumRemotePort) begin + // Intra-group remote port: pass address untouched; extract target tile ID. 
tile_sel_o[port - NumCache] = - mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; + mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; + end else begin + // Inter-group remote port: pass address untouched; extract target tile ID. + remote_group_sel_o[port - NumCache - NumRemotePort] = + mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; end end @@ -433,4 +529,21 @@ module tcdm_cache_interco #( assign mem_rsp_ready_o = mem_rsp_ready; + // ------------------------------------------------------------------------- + // Assertions + // ------------------------------------------------------------------------- +`ifndef TARGET_SYNTHESIS + // This is used to ensure we will not have illegal visits to DRAM + // This kind of error can be latent in the system until the entry is evicted + for (genvar x = 0; x < TotInPorts; x++) begin : gen_addr_assert + CoreReqAddrAboveDram: assert property ( + @(posedge clk_i) disable iff (!rst_ni !== '0) + core_req_i[x].q_valid |-> core_req_i[x].q.addr >= addr_t'(DramBaseAddr) + ) else begin + $error("[%m] port %0d: addr 0x%08x is below DramBaseAddr 0x%08x", + x, core_req_i[x].q.addr, DramBaseAddr); + end + end +`endif + endmodule diff --git a/hardware/tb/cachepool_cluster_wrapper.sv b/hardware/tb/cachepool_cluster_wrapper.sv index a9dba20..e412f7c 100644 --- a/hardware/tb/cachepool_cluster_wrapper.sv +++ b/hardware/tb/cachepool_cluster_wrapper.sv @@ -13,50 +13,65 @@ module cachepool_cluster_wrapper import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; #( - parameter int unsigned AxiAddrWidth = SpatzAxiAddrWidth, - parameter int unsigned AxiDataWidth = SpatzAxiDataWidth, - parameter int unsigned AxiUserWidth = SpatzAxiUserWidth, - parameter int unsigned AxiInIdWidth = SpatzAxiIdInWidth, - parameter int unsigned AxiOutIdWidth = SpatzAxiIdOutWidth, - - parameter type axi_in_resp_t = spatz_axi_in_resp_t, - parameter type axi_in_req_t = spatz_axi_in_req_t, - - parameter 
type axi_out_resp_t = spatz_axi_out_resp_t, - parameter type axi_out_req_t = spatz_axi_out_req_t, - - parameter type axi_narrow_req_t = spatz_axi_narrow_req_t, - parameter type axi_narrow_resp_t = spatz_axi_narrow_resp_t + parameter int unsigned AxiAddrWidth = SpatzAxiAddrWidth, + parameter int unsigned AxiDataWidth = SpatzAxiDataWidth, + parameter int unsigned AxiUserWidth = SpatzAxiUserWidth, + // External input ID width (SoC/testbench → wrapper); remapped to SpatzAxiIdInWidth inside. + parameter int unsigned AxiInIdWidth = WrapperAxiIdInWidth, + // External wide output ID width (wrapper → DRAM); remapped from SpatzAxiIdOutWidth inside. + parameter int unsigned AxiOutIdWidth = WrapperAxiIdOutWidth, + // External narrow output ID width (UART, wrapper → SoC); remapped from SpatzAxiUartIdWidth inside. + parameter int unsigned AxiNarrowOutIdWidth = WrapperAxiNarrowIdOutWidth, + + // External input types use the wrapper-narrowed ID (WrapperAxiIdInWidth). + parameter type axi_in_req_t = spatz_axi_wrapper_in_req_t, + parameter type axi_in_resp_t = spatz_axi_wrapper_in_resp_t, + + // External wide output types use the wrapper-narrowed ID (WrapperAxiIdOutWidth). + parameter type axi_out_req_t = spatz_axi_wrapper_out_req_t, + parameter type axi_out_resp_t = spatz_axi_wrapper_out_resp_t, + + // External narrow output types use the wrapper-narrowed ID (WrapperAxiNarrowIdOutWidth). 
+ parameter type axi_narrow_out_req_t = spatz_axi_wrapper_narrow_out_req_t, + parameter type axi_narrow_out_resp_t = spatz_axi_wrapper_narrow_out_resp_t )( - input logic clk_i, - input logic rst_ni, - output logic [3:0] eoc_o, - input logic debug_req_i, - - input logic meip_i, - input logic mtip_i, - input logic msip_i, - output logic cluster_probe_o, - input axi_in_req_t axi_in_req_i, - output axi_in_resp_t axi_in_resp_o, - /// AXI Narrow out-port (UART) - output axi_uart_req_t axi_narrow_req_o, - input axi_uart_resp_t axi_narrow_resp_i, - output axi_out_req_t [NumClusterSlv-1:0] axi_out_req_o, - input axi_out_resp_t [NumClusterSlv-1:0] axi_out_resp_i + input logic clk_i, + input logic rst_ni, + output logic [3:0] eoc_o, + input logic debug_req_i, + + input logic meip_i, + input logic mtip_i, + input logic msip_i, + output logic cluster_probe_o, + // AXI slave port (from SoC/testbench); external ID = AxiInIdWidth. + input axi_in_req_t axi_in_req_i, + output axi_in_resp_t axi_in_resp_o, + /// AXI Narrow out-port (UART); external ID = AxiNarrowOutIdWidth. + output axi_narrow_out_req_t axi_narrow_req_o, + input axi_narrow_out_resp_t axi_narrow_resp_i, + // AXI wide master ports (to DRAM); external ID = AxiOutIdWidth. + output axi_out_req_t [NumClusterSlv-1:0] axi_out_req_o, + input axi_out_resp_t [NumClusterSlv-1:0] axi_out_resp_i ); - - spatz_axi_iwc_out_req_t [NumClusterSlv-1:0] axi_from_cluster_iwc_req; - spatz_axi_iwc_out_resp_t [NumClusterSlv-1:0] axi_from_cluster_iwc_resp; + // Internal signals between wrapper remappers and cluster (fat IDs). + spatz_axi_in_req_t axi_cluster_in_req; + spatz_axi_in_resp_t axi_cluster_in_resp; + axi_uart_req_t axi_cluster_narrow_req; + axi_uart_resp_t axi_cluster_narrow_resp; + spatz_axi_out_req_t [NumClusterSlv-1:0] axi_cluster_out_req; + spatz_axi_out_resp_t [NumClusterSlv-1:0] axi_cluster_out_resp; // Spatz cluster under test. + // Internal AXI types are fixed (full-width IDs); the wrapper remaps at both boundaries. 
cachepool_cluster #( .AxiAddrWidth (AxiAddrWidth ), .AxiDataWidth (AxiDataWidth ), - .AxiIdWidthIn (AxiInIdWidth ), - .AxiIdWidthOut (AxiOutIdWidth ), + // Cluster always sees the full internal ID width on its slave port. + .AxiIdWidthIn (SpatzAxiIdInWidth ), + .AxiIdWidthOut (SpatzAxiIdOutWidth ), .AxiUserWidth (AxiUserWidth ), .BootAddr (BootAddr ), .UartAddr (UartAddr ), @@ -74,15 +89,15 @@ module cachepool_cluster_wrapper .NumIntOutstandingLoads (NumIntOutstandingLoads ), .NumIntOutstandingMem (NumIntOutstandingMem ), .NumSpatzOutstandingLoads (NumSpatzOutstandingLoads ), - .axi_in_req_t (axi_in_req_t ), - .axi_in_resp_t (axi_in_resp_t ), - .axi_narrow_req_t (axi_narrow_req_t ), - .axi_narrow_resp_t (axi_narrow_resp_t ), - .axi_out_req_t (axi_out_req_t ), - .axi_out_resp_t (axi_out_resp_t ), - .Xdma (4'h0 ), - .DMAAxiReqFifoDepth (3 ), - .DMAReqFifoDepth (3 ), + // Cluster slave port uses full internal type (remap is above this level). + .axi_in_req_t (spatz_axi_in_req_t ), + .axi_in_resp_t (spatz_axi_in_resp_t ), + // Cluster per-tile narrow type (internal crossbar width, not the UART mux output). + .axi_narrow_req_t (spatz_axi_narrow_req_t ), + .axi_narrow_resp_t (spatz_axi_narrow_resp_t ), + // Cluster internally uses the fat output type; the wrapper remaps it. 
+ .axi_out_req_t (spatz_axi_out_req_t ), + .axi_out_resp_t (spatz_axi_out_resp_t ), .RegisterOffloadRsp (1 ), .RegisterCoreReq (1 ), .RegisterCoreRsp (1 ), @@ -97,22 +112,86 @@ module cachepool_cluster_wrapper .eoc_o (eoc_o ), .impl_i ('0 ), .error_o ( ), - .debug_req_i ({NumCores{debug_req_i}} ), - .meip_i ({NumCores{meip_i}} ), - .mtip_i ({NumCores{mtip_i}} ), - .msip_i ({NumCores{msip_i}} ), + .debug_req_i (debug_req_i ), + .meip_i (meip_i ), + .mtip_i (mtip_i ), + .msip_i (msip_i ), .hart_base_id_i (10'h0 ), .cluster_base_addr_i (TCDMStartAddr ), .cluster_probe_o (cluster_probe_o ), - .axi_in_req_i , - .axi_in_resp_o , - .axi_narrow_req_o , - .axi_narrow_resp_i , - // AXI Master Port - .axi_out_req_o ( axi_out_req_o ), - .axi_out_resp_i ( axi_out_resp_i ) + // Remapped internal connections. + .axi_in_req_i (axi_cluster_in_req ), + .axi_in_resp_o (axi_cluster_in_resp ), + .axi_narrow_req_o (axi_cluster_narrow_req ), + .axi_narrow_resp_i (axi_cluster_narrow_resp ), + // AXI Master Port (fat IDs; wrapper remaps before external port). + .axi_out_req_o (axi_cluster_out_req ), + .axi_out_resp_i (axi_cluster_out_resp ) + ); + + // Expand WrapperAxiIdInWidth -> SpatzAxiIdInWidth on the cluster slave port. + // The external SoC/testbench drives narrow IDs; the cluster expects full-width IDs. + axi_id_remap #( + .AxiSlvPortIdWidth ( WrapperAxiIdInWidth ), + // Up to 2^WrapperAxiIdInWidth = 16 unique IDs from external host. 
+ .AxiSlvPortMaxUniqIds ( 2**WrapperAxiIdInWidth ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( SpatzAxiIdInWidth ), + .slv_req_t ( axi_in_req_t ), + .slv_resp_t ( axi_in_resp_t ), + .mst_req_t ( spatz_axi_in_req_t ), + .mst_resp_t ( spatz_axi_in_resp_t ) + ) i_in_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_in_req_i ), + .slv_resp_o ( axi_in_resp_o ), + .mst_req_o ( axi_cluster_in_req ), + .mst_resp_i ( axi_cluster_in_resp ) ); + // Compress SpatzAxiUartIdWidth -> WrapperAxiNarrowIdOutWidth on the UART master port. + axi_id_remap #( + .AxiSlvPortIdWidth ( SpatzAxiUartIdWidth ), + // Cap at 2^WrapperAxiNarrowIdOutWidth unique IDs toward the SoC. + .AxiSlvPortMaxUniqIds ( 2**WrapperAxiNarrowIdOutWidth ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( WrapperAxiNarrowIdOutWidth ), + .slv_req_t ( axi_uart_req_t ), + .slv_resp_t ( axi_uart_resp_t ), + .mst_req_t ( axi_narrow_out_req_t ), + .mst_resp_t ( axi_narrow_out_resp_t ) + ) i_narrow_out_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_cluster_narrow_req ), + .slv_resp_o ( axi_cluster_narrow_resp ), + .mst_req_o ( axi_narrow_req_o ), + .mst_resp_i ( axi_narrow_resp_i ) + ); + + // Reduce SpatzAxiIdOutWidth -> WrapperAxiIdOutWidth per DRAM channel. + // NumAxiMaxTrans = 32 outstanding per channel; 6 bits gives 64 unique ID slots. 
+ for (genvar ch = 0; ch < NumClusterSlv; ch++) begin : gen_out_id_remap + axi_id_remap #( + .AxiSlvPortIdWidth ( SpatzAxiIdOutWidth ), + .AxiSlvPortMaxUniqIds ( NumAxiMaxTrans ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( WrapperAxiIdOutWidth ), + .slv_req_t ( spatz_axi_out_req_t ), + .slv_resp_t ( spatz_axi_out_resp_t ), + .mst_req_t ( spatz_axi_wrapper_out_req_t ), + .mst_resp_t ( spatz_axi_wrapper_out_resp_t ) + ) i_out_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_cluster_out_req [ch] ), + .slv_resp_o ( axi_cluster_out_resp [ch] ), + .mst_req_o ( axi_out_req_o [ch] ), + .mst_resp_i ( axi_out_resp_i [ch] ) + ); + end + // AXI utilization monitor `ifndef TARGET_SYNTHESIS typedef logic [31:0] cnt_t; @@ -241,11 +320,14 @@ module cachepool_cluster_wrapper if (AxiUserWidth != SpatzAxiUserWidth) $error("[spatz_cluster_wrapper] AXI User Width does not match the configuration."); - if (AxiInIdWidth != SpatzAxiIdInWidth) + if (AxiInIdWidth != WrapperAxiIdInWidth) $error("[spatz_cluster_wrapper] AXI Id Width (In) does not match the configuration."); - if (AxiOutIdWidth != SpatzAxiIdOutWidth) + if (AxiOutIdWidth != WrapperAxiIdOutWidth) $error("[spatz_cluster_wrapper] AXI Id Width (Out) does not match the configuration."); + + if (AxiNarrowOutIdWidth != WrapperAxiNarrowIdOutWidth) + $error("[spatz_cluster_wrapper] AXI Narrow Id Width (Out) does not match the configuration."); `endif endmodule diff --git a/hardware/tb/tb_cachepool.sv b/hardware/tb/tb_cachepool.sv index 7da3e78..385d699 100644 --- a/hardware/tb/tb_cachepool.sv +++ b/hardware/tb/tb_cachepool.sv @@ -66,18 +66,19 @@ module tb_cachepool; localparam NumAXISlaves = 2; localparam NumRules = NumAXISlaves-1; - // Spatz wide port to SoC (currently dram) - spatz_axi_out_req_t [NumL2Channel-1:0] axi_from_cluster_req; - spatz_axi_out_resp_t [NumL2Channel-1:0] axi_from_cluster_resp; - // From SoC to Spatz - spatz_axi_in_req_t axi_to_cluster_req; - spatz_axi_in_resp_t 
axi_to_cluster_resp; + // Spatz wide port to SoC (currently dram); IDs narrowed by wrapper-level axi_id_remap. + spatz_axi_wrapper_out_req_t [NumL2Channel-1:0] axi_from_cluster_req; + spatz_axi_wrapper_out_resp_t [NumL2Channel-1:0] axi_from_cluster_resp; + // From SoC to Spatz; IDs expanded by wrapper-level axi_id_remap (WrapperAxiIdInWidth → SpatzAxiIdInWidth). + spatz_axi_wrapper_in_req_t axi_to_cluster_req; + spatz_axi_wrapper_in_resp_t axi_to_cluster_resp; - axi_uart_req_t axi_uart_req; - axi_uart_resp_t axi_uart_rsp; + // UART; IDs compressed by wrapper-level axi_id_remap (SpatzAxiUartIdWidth → WrapperAxiNarrowIdOutWidth). + spatz_axi_wrapper_narrow_out_req_t axi_uart_req; + spatz_axi_wrapper_narrow_out_resp_t axi_uart_rsp; // DRAM Scrambled request - spatz_axi_out_req_t [NumL2Channel-1:0] axi_dram_req; + spatz_axi_wrapper_out_req_t [NumL2Channel-1:0] axi_dram_req; /********* @@ -141,13 +142,13 @@ module tb_cachepool; reqrsp_cluster_in_rsp_t to_cluster_rsp; reqrsp_to_axi #( - .DataWidth (SpatzDataWidth ), - .AxiUserWidth(SpatzAxiUserWidth ), - .UserWidth ($bits(tcdm_user_t) ), - .axi_req_t (spatz_axi_in_req_t ), - .axi_rsp_t (spatz_axi_in_resp_t ), - .reqrsp_req_t(reqrsp_cluster_in_req_t), - .reqrsp_rsp_t(reqrsp_cluster_in_rsp_t) + .DataWidth (SpatzDataWidth ), + .AxiUserWidth(SpatzAxiUserWidth ), + .UserWidth ($bits(tcdm_user_t) ), + .axi_req_t (spatz_axi_wrapper_in_req_t ), + .axi_rsp_t (spatz_axi_wrapper_in_resp_t ), + .reqrsp_req_t(reqrsp_cluster_in_req_t ), + .reqrsp_rsp_t(reqrsp_cluster_in_rsp_t ) ) i_reqrsp_to_axi ( .clk_i (clk ), .rst_ni (rst_n ), @@ -211,6 +212,72 @@ module tb_cachepool; to_cluster_req = '0; + // Initialize L1D cache before waking up cores + // Step 1: Write init instruction (flush + invalidate) + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_CFG_L1D_INSN_OFFSET, + data : 32'h3, + write : 1'b1, + strb : '1, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + 
`wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + + // Step 2: Commit the instruction + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_L1D_INSN_COMMIT_OFFSET, + data : 32'h1, + write : 1'b1, + strb : '1, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + + // Step 3: Poll until flush complete + begin + automatic logic [31:0] flush_status; + do begin + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_L1D_FLUSH_STATUS_OFFSET, + write : 1'b0, + strb : '0, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + flush_status = to_cluster_rsp.p.data; + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + end while (flush_status[0]); + end + // Wake up cores debug_req = '1; @(posedge clk); @@ -227,8 +294,8 @@ module tb_cachepool; **********/ axi_uart #( - .axi_req_t (axi_uart_req_t ), - .axi_resp_t(axi_uart_resp_t) + .axi_req_t (spatz_axi_wrapper_narrow_out_req_t ), + .axi_resp_t(spatz_axi_wrapper_narrow_out_resp_t) ) i_axi_uart ( .clk_i (clk ), .rst_ni (rst_n ), @@ -314,19 +381,19 @@ module tb_cachepool; for (genvar mem = 0; mem < NumL2Channel; mem++) begin: gen_dram axi_dram_sim #( - .BASE ( DramBase ), - .DRAMType ( DramType ), - .AxiAddrWidth ( SpatzAxiAddrWidth ), - .AxiDataWidth ( SpatzAxiDataWidth ), - .AxiIdWidth ( SpatzAxiIdOutWidth ), - .AxiUserWidth ( 
SpatzAxiUserWidth ), - .axi_req_t ( spatz_axi_out_req_t ), - .axi_resp_t ( spatz_axi_out_resp_t ), - .axi_ar_t ( spatz_axi_out_ar_chan_t ), - .axi_r_t ( spatz_axi_out_r_chan_t ), - .axi_aw_t ( spatz_axi_out_aw_chan_t ), - .axi_w_t ( spatz_axi_out_w_chan_t ), - .axi_b_t ( spatz_axi_out_b_chan_t ) + .BASE ( DramBase ), + .DRAMType ( DramType ), + .AxiAddrWidth ( SpatzAxiAddrWidth ), + .AxiDataWidth ( SpatzAxiDataWidth ), + .AxiIdWidth ( WrapperAxiIdOutWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .axi_req_t ( spatz_axi_wrapper_out_req_t ), + .axi_resp_t ( spatz_axi_wrapper_out_resp_t ), + .axi_ar_t ( spatz_axi_wrapper_out_ar_chan_t ), + .axi_r_t ( spatz_axi_wrapper_out_r_chan_t ), + .axi_aw_t ( spatz_axi_wrapper_out_aw_chan_t ), + .axi_w_t ( spatz_axi_wrapper_out_w_chan_t ), + .axi_b_t ( spatz_axi_wrapper_out_b_chan_t ) ) i_axi_dram_sim ( .clk_i ( clk ), .rst_ni ( rst_n ), diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..216cd0d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +# Python packages required for hardware code generation (make generate). +# dataclasses is a stdlib backport needed on Python 3.6; it is a no-op on 3.7+. 
+dataclasses +hjson +jsonref +jsonschema +mako +termcolor diff --git a/sim/scripts/vsim_cluster.tcl b/sim/scripts/vsim_cluster.tcl index c8d66f3..2213a16 100644 --- a/sim/scripts/vsim_cluster.tcl +++ b/sim/scripts/vsim_cluster.tcl @@ -5,11 +5,7 @@ # Create group for Cluster onerror {resume} -set cluster_path $1 - -add wave -noupdate -group Cluster -group xbar -group req_xbar ${cluster_path}/i_cluster_xbar/i_req_xbar/* -add wave -noupdate -group Cluster -group xbar -group rsp_xbar ${cluster_path}/i_cluster_xbar/i_rsp_xbar/* -add wave -noupdate -group Cluster -group xbar ${cluster_path}/i_cluster_xbar/* +quietly set cluster_path $1 add wave -noupdate -group Cluster -group CSR ${cluster_path}/i_cachepool_cluster_peripheral/* diff --git a/sim/scripts/vsim_core.tcl b/sim/scripts/vsim_core.tcl index 9510e33..30ee61a 100644 --- a/sim/scripts/vsim_core.tcl +++ b/sim/scripts/vsim_core.tcl @@ -4,179 +4,192 @@ # Create group for core $1 onerror {resume} - -set core_path ${3} - -add wave -noupdate -group tile[$1]_core[$2] -group scalar_xbar ${core_path}/i_cachepool_cc/i_scalar_xbar/* - -add wave -noupdate -group tile[$1]_core[$2] -group Params ${core_path}/i_cachepool_cc/BootAddr -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/clk_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/rst_i -add wave -noupdate -group tile[$1]_core[$2] -radix unsigned ${core_path}/i_cachepool_cc/i_snitch/hart_id_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Instructions -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_addr_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_data_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_valid_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_ready_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Load/Store -add wave 
-noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/data_req_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/data_rsp_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Accelerator -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qreq_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qrsp_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qvalid_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qready_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_prsp_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_pvalid_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_pready_o - -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/illegal_inst -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/zero_lsb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_d -add wave -noupdate -group tile[$1]_core[$2] -group 
Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider LSU -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_size -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_amo -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qvalid -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pvalid -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_load -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_i -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_acc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider ALU -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/iimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/jimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/bimm -add wave -noupdate -group 
tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/simm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/adder_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs1 -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs2 -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_raddr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_rdata -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_waddr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_wdata -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_we -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/consec_pc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_load -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_store -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_signed -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_misaligned -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_addr_misaligned -add wave -noupdate -group 
tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/st_addr_misaligned -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/valid_instr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/exception -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_op -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/write_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uses_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/next_pc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_bypass -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_branch -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_rvalue -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_en -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_register_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/operands_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/dst_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch 
${core_path}/i_cachepool_cc/i_snitch/opb_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_reversed -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_ext -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result_ext -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_arithmetic -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_writeback -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/core_events_o - -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -group Internal -group RF ${core_path}/i_cachepool_cc/i_snitch/i_snitch_regfile/* -add wave -noupdate -group 
tile[$1]_core[$2] -group Snitch -group Internal ${core_path}/i_cachepool_cc/i_snitch/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_valid_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_ready_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_req_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_rsp_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_valid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_ready_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_valid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_ready_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_valid_i - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group FPR ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fpr/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group LSU ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fp_lsu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group Controller 
${core_path}/i_cachepool_cc/i_spatz/i_controller/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterWrite -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wvalid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterRead -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/re_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rvalid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider Internal -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr -add wave -noupdate -group 
tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VSLDU ${core_path}/i_cachepool_cc/i_spatz/i_vsldu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VFU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group FPU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/gen_fpu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Internal ${core_path}/i_cachepool_cc/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB0 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[0]/i_reorder_buffer/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB1 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[1]/i_reorder_buffer/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_valid -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_ready -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_empty -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_pop -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_push -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo_bypass +quietly WaveActivateNextPane {} 0 + +quietly set core_path ${4} 
+quietly set name g_${1}_t_${2}_c_${3} + +# Build the parent group prefix list from optional args 5 (GroupWP) and 6 (tile) +quietly set parent_grp [list] +if {$argc > 4 && "${5}" != ""} { + quietly lappend parent_grp -group ${5} +} +if {$argc > 5 && "${6}" != ""} { + quietly lappend parent_grp -group ${6} +} + +# The {*} syntax safely expands the list. +# If $parent_grp is empty, it safely ignores it instead of passing "". +add wave -noupdate {*}$parent_grp -group ${name} -group scalar_xbar ${core_path}/i_cachepool_cc/i_scalar_xbar/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Params ${core_path}/i_cachepool_cc/BootAddr +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/clk_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/rst_i +add wave -noupdate {*}$parent_grp -group ${name} -radix unsigned ${core_path}/i_cachepool_cc/i_snitch/hart_id_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Instructions +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_addr_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_data_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_valid_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_ready_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Load/Store +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/data_req_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/data_rsp_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Accelerator +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qreq_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qrsp_i +add wave -noupdate {*}$parent_grp -group 
${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qvalid_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qready_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_prsp_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_pvalid_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_pready_o + +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/illegal_inst +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/zero_lsb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -divider LSU +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_size +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_amo +add wave -noupdate {*}$parent_grp -group ${name} 
-group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qvalid +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pvalid +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_load +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_i +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_acc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -divider ALU +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/iimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/jimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/bimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/simm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/adder_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_result +add 
wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs1 +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs2 +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_raddr +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_rdata +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_waddr +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_wdata +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_we +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/consec_pc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_load +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_store +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_signed +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_addr_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/st_addr_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/valid_instr +add wave 
-noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/exception +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_op +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/write_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uses_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/next_pc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_bypass +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_branch +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_rvalue +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_en +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_register_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/operands_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/dst_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa +add 
wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_reversed +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_ext +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result_ext +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_arithmetic +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opa +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_writeback +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/core_events_o + +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -group Internal -group RF ${core_path}/i_cachepool_cc/i_snitch/i_snitch_regfile/* +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -group Internal 
${core_path}/i_cachepool_cc/i_snitch/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_valid_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_ready_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_req_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_rsp_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_valid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_ready_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_valid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_ready_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_valid_i + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" -group FPR ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fpr/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" -group LSU ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fp_lsu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group 
Controller ${core_path}/i_cachepool_cc/i_spatz/i_controller/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider RegisterWrite +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wvalid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider RegisterRead +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/re_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rvalid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider Internal +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe +add wave -noupdate {*}$parent_grp -group ${name} -group 
Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VSLDU ${core_path}/i_cachepool_cc/i_spatz/i_vsldu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VFU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group FPU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/gen_fpu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Internal ${core_path}/i_cachepool_cc/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU -group ROB0 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[0]/i_reorder_buffer/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU -group ROB1 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[1]/i_reorder_buffer/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_valid +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_empty +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_pop +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_push +add wave -noupdate 
{*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo_bypass diff --git a/sim/scripts/vsim_group.tcl b/sim/scripts/vsim_group.tcl index 8edb7e5..876c4b2 100644 --- a/sim/scripts/vsim_group.tcl +++ b/sim/scripts/vsim_group.tcl @@ -2,18 +2,21 @@ # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 -# Create group for Tile $1 onerror {resume} -set group_path $1 +quietly set group_path $1 +quietly set parent_grp $3 # Add waves for remote xbar for {set p 0} {$p < $2} {incr p} { onerror {resume} - set xbar_path ${group_path}/gen_remote_tile_xbar[$p]/i_tile_remote_xbar + quietly set xbar_path ${group_path}/gen_remote_tile_xbar[$p]/i_tile_remote_xbar - add wave -noupdate -group Group -group remote_xbar[$p] ${xbar_path}/* + add wave -noupdate -group "${parent_grp}" -group remote_xbar[$p] ${xbar_path}/* } -add wave -noupdate -group Group -group Internal ${group_path}/* +add wave -noupdate -group "${parent_grp}" -group refill_xbar -group req_xbar ${group_path}/i_refill_xbar/i_req_xbar/* +add wave -noupdate -group "${parent_grp}" -group refill_xbar -group rsp_xbar ${group_path}/i_refill_xbar/i_rsp_xbar/* + +add wave -noupdate -group "${parent_grp}" -group Internal ${group_path}/* diff --git a/sim/scripts/vsim_tile.tcl b/sim/scripts/vsim_tile.tcl index 8763440..5a9565e 100644 --- a/sim/scripts/vsim_tile.tcl +++ b/sim/scripts/vsim_tile.tcl @@ -5,38 +5,44 @@ # Create group for Tile $1 onerror {resume} -set tile_path $2 +quietly set tile_path $3 +quietly set parent_grp $4 + +# --- Configuration Variables --- +# NrTCDMPortsPerCore: 4 Spatz ports + 1 Snitch port +quietly set NUM_XBARS 5 +quietly set SNITCH_IDX [expr {$NUM_XBARS - 1}] # Add waves for tcdm_mapper and csrs -# add wave -noupdate -group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* -# add wave -noupdate -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* +# add wave -noupdate -group ${parent_grp} 
-group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* +# add wave -noupdate -group ${parent_grp} -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* # Add waves for xbars -add wave -noupdate -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* -add wave -noupdate -group tile[$1] -group wide_xbar ${tile_path}/i_tile/i_axi_wide_xbar/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group wide_xbar ${tile_path}/i_tile/i_axi_wide_xbar/* -add wave -noupdate -group Barrier -group tile[$1] ${tile_path}/i_tile/i_cachepool_tile_barrier/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group Barrier ${tile_path}/i_tile/i_cachepool_tile_barrier/* # Add waves for cache controller for {set c 0} {$c < 4} {incr c} { onerror {resume} - set cache_path ${tile_path}/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller + quietly set cache_path ${tile_path}/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller - add wave -noupdate -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[4]/gen_amo/i_cache_amo/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[${SNITCH_IDX}]/gen_amo/i_cache_amo/* - add wave -noupdate -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* - add wave -noupdate -group tile[$1] -group cache[$c] -group core ${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl1 
${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group core ${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl1 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* } -for {set c 0} {$c < 5} {incr c} { - add wave -noupdate -group tile[$1] -group cache_xbar -group xbar[$c] ${tile_path}/i_tile/gen_cache_xbar[$c]/i_cache_xbar/* +for {set c 0} {$c < $NUM_XBARS} {incr c} { + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache_xbar -group xbar[$c] 
${tile_path}/i_tile/gen_cache_xbar[$c]/gen_remote_group_slice/i_cache_xbar/* } # Add waves for remaining signals -add wave -noupdate -group tile[$1] -group Internal ${tile_path}/i_tile/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group Internal ${tile_path}/i_tile/* diff --git a/sim/scripts/vsim_wave.tcl b/sim/scripts/vsim_wave.tcl index d5fa528..34d35f3 100644 --- a/sim/scripts/vsim_wave.tcl +++ b/sim/scripts/vsim_wave.tcl @@ -1,12 +1,16 @@ -# Copyright 2021 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 onerror {resume} quietly WaveActivateNextPane {} 0 -set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster -set group_path ${cluster_path}/gen_group/i_group +# --- Configuration Variables --- +quietly set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster +quietly set NUM_GROUPS 4 ;# Total number of groups +quietly set NUM_GROUPS_X 2 ;# X dimension of group mesh (NUM_GROUPS_Y = NUM_GROUPS / NUM_GROUPS_X) +quietly set NUM_TILES 4 ;# Tiles per group +quietly set NUM_CORES 4 ;# Cores per tile # Add the cluster probe add wave /tb_cachepool/cluster_probe @@ -14,23 +18,38 @@ add wave /tb_cachepool/cluster_probe # Cluster do sim/scripts/vsim_cluster.tcl ${cluster_path} -# Group -# add wave -noupdate -group Group ${group_path}/* -do sim/scripts/vsim_group.tcl ${group_path} 5 - -# Tile and Core -for {set tile 0} {$tile < 4} {incr tile} { - set tile_path ${group_path}/gen_tiles[$tile] - - do sim/scripts/vsim_tile.tcl $tile ${tile_path} - # Add all cores in Tile 0 - for {set core 0} {$core < 4} {incr core} { - set core_path ${tile_path}/i_tile/gen_core[$core] - do sim/scripts/vsim_core.tcl $tile $core ${core_path} - } - - for {set ch 0} {$ch < 4} {incr ch} { - add wave -noupdate -group DramSys$ch /tb_cachepool/gen_dram[$ch]/i_axi_dram_sim/* +# Iterate through all groups using 2D coordinates +for {set g 
0} {$g < $NUM_GROUPS} {incr g} { + quietly set gy [expr {$g % $NUM_GROUPS_X}] + quietly set gx [expr {$g / $NUM_GROUPS_X}] + quietly set group_wp_path ${cluster_path}/gen_group_y[${gy}]/gen_group_x[${gx}]/i_group + quietly set group_path ${group_wp_path}/i_group + quietly set gwp_name "GroupWP_Y${gy}_X${gx}" + + # 1. Plot GroupWP signals for this group (always, all groups) + add wave -noupdate -group "${gwp_name}" ${group_wp_path}/* + + + # 2. Plot Group-level signals nested inside GroupWP (always, all groups) + do sim/scripts/vsim_group.tcl ${group_path} 5 "${gwp_name}" + + # 3. Plot all tiles and cores for groups (gx,gy) = (0,0) always, + # and (0,1) when NUM_GROUPS_X >= 2. NOTE(review): this selects (0,1), not the (1,1) diagonal -- confirm intent + if {($gx == 0 && $gy == 0) || ($gx == 0 && $gy == 1 && $NUM_GROUPS_X >= 2)} { + for {set tile 0} {$tile < $NUM_TILES} {incr tile} { + quietly set tile_path ${group_path}/gen_tiles[${tile}]/gen_tile + do sim/scripts/vsim_tile.tcl $tile $g ${tile_path} "${gwp_name}" + + # 4. Plot all cores grouped under their tile + for {set core 0} {$core < $NUM_CORES} {incr core} { + quietly set core_path ${tile_path}/i_tile/gen_core[${core}] + do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "${gwp_name}" "tile[${tile}]" + } + } } } +# Add DRAM waves once at the end +for {set ch 0} {$ch < 4} {incr ch} { + add wave -noupdate -group "DramSys_$ch" /tb_cachepool/gen_dram[$ch]/i_axi_dram_sim/* +} diff --git a/sim/scripts/vsim_wave_single_tile.tcl b/sim/scripts/vsim_wave_single_tile.tcl index 28e54e5..125d849 100644 --- a/sim/scripts/vsim_wave_single_tile.tcl +++ b/sim/scripts/vsim_wave_single_tile.tcl @@ -5,9 +5,10 @@ onerror {resume} quietly WaveActivateNextPane {} 0 -set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster -set group_path ${cluster_path} -set tile_path ${group_path}/gen_tile +quietly set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster +quietly set group_wp_path ${cluster_path}/gen_group[0][0]/i_group +quietly set group_path ${group_wp_path}/i_group 
+quietly set tile_path ${group_path}/gen_tiles[0]/gen_tile # Add the cluster probe @@ -15,11 +16,11 @@ add wave /tb_cachepool/cluster_probe do sim/scripts/vsim_cluster.tcl ${cluster_path} -do sim/scripts/vsim_tile.tcl 0 ${tile_path} +do sim/scripts/vsim_tile.tcl 0 0 ${tile_path} # Add all cores in Tile 0 for {set core 0} {$core < 4} {incr core} { - set core_path ${tile_path}/i_tile/gen_core[$core] - do sim/scripts/vsim_core.tcl 0 $core ${core_path} + quietly set core_path ${tile_path}/i_tile/gen_core[$core] + do sim/scripts/vsim_core.tcl 0 0 $core ${core_path} "" } for {set ch 0} {$ch < 4} {incr ch} { diff --git a/sim/sim.mk b/sim/sim.mk index 56cee77..80342e8 100644 --- a/sim/sim.mk +++ b/sim/sim.mk @@ -82,7 +82,7 @@ ${SIM_DIR}/${DPI_LIB}/cachepool_dpi.so: ${dpi_target} # ----------------- ${WORK_DIR}/${FESVR_VERSION}_unzip: mkdir -p $(dir $@) - wget -O $(dir $@)/${FESVR_VERSION} https://github.com/riscv/riscv-isa-sim/tarball/${FESVR_VERSION} + curl -fL -o $(dir $@)/${FESVR_VERSION} https://github.com/riscv/riscv-isa-sim/tarball/${FESVR_VERSION} tar xfm $(dir $@)${FESVR_VERSION} --strip-components=1 -C $(dir $@) touch $@ @@ -100,17 +100,24 @@ ${WORK_DIR}/compile.vsim.tcl: ${SNLIB_DIR}/rtl_lib.cc ${SNLIB_DIR}/common_lib.cc echo 'return 0' >> $@ # Wrapper script & GUI script +# The generated scripts derive ROOT_DIR from their own location at runtime so +# that they remain portable across different checkout paths (CI runners, moved +# repos). All absolute paths baked in by make are replaced by a single sed pass. define QUESTASIM ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log @! grep -P "Errors: [1-9]*," $(dir $<)vsim.log @mkdir -p $(SIMBIN_DIR) $(SIMBIN_DIR)/logs - @echo "#!/bin/bash" > $(SIMBIN_DIR)/cachepool_cluster.vsim + @echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim + @echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." 
&& pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim @echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim @echo '${VSIM} +permissive ${VSIM_FLAGS} -do "run -a" -work ${WORK_DIR} -c -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim + @sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim @chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim - @echo "#!/bin/bash" > $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." && pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @echo '${VSIM} +permissive ${VSIM_FLAGS} -do "log -r /*; source ${WAVE_FILE}; run -a" -work ${WORK_DIR} -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim.gui endef diff --git a/software/snRuntime/CMakeLists.txt b/software/snRuntime/CMakeLists.txt index ff75a29..2c1b5ea 100644 --- a/software/snRuntime/CMakeLists.txt +++ b/software/snRuntime/CMakeLists.txt @@ -134,4 +134,6 @@ add_snitch_test(printf_simple tests/printf_simple.c) if(SNITCH_RUNTIME STREQUAL "snRuntime-cluster") add_snitch_test(dma_simple tests/dma_simple.c) add_snitch_test(atomics tests/atomics.c) + add_snitch_test(dram_alloc tests/dram_alloc.c) + add_snitch_test(fence tests/fence.c) endif() diff --git a/software/snRuntime/README.md b/software/snRuntime/README.md index d19e74e..b804918 100644 --- a/software/snRuntime/README.md +++ b/software/snRuntime/README.md @@ -1,48 +1,198 @@ -# Snitch Runtime Library +# snRuntime — CachePool 
Software Runtime + +This library is the bare-metal software runtime for the CachePool manycore system. It is derived from the upstream Snitch runtime and extended with CachePool-specific cache management and peripheral APIs. + +## Folder Structure + +``` +snRuntime/ +├── include/ # Public headers — include these in application code +│ ├── snrt.h # Master header: topology, barriers, DMA, allocation +│ ├── l1cache.h # CachePool L1 data cache management API +│ ├── cachepool_peripheral.h # Register offsets for the cluster peripheral +│ ├── perf_cnt.h # Performance counter API +│ ├── team.h # Team/cluster descriptor structs +│ ├── interface.h # Hardware interface definitions +│ ├── debug.h # Debug printf helpers +│ ├── dm.h # Data-mover (DMA) low-level interface +│ ├── eu.h # Execution unit (work dispatch) interface +│ ├── kmp.h # OpenMP KMP interface +│ └── omp.h # OpenMP runtime interface +├── src/ # Runtime implementation +│ ├── start.S # Entry point (hart 0 boots, others wait for IPI) +│ ├── team.c # Team/topology initialisation +│ ├── barrier.c # Hardware and software barrier implementations +│ ├── l1cache.c # CachePool L1 cache management (flush, partition, xbar) +│ ├── alloc.c # L1 TCDM bump allocator + DRAM linked-list allocator +│ ├── memcpy.c # Optimised memcpy +│ ├── perf_cnt.c # Performance counter helpers +│ ├── printf.c # Lightweight printf (wraps vendor/printf.c) +│ ├── dm.c / dma.c # DMA engine helpers +│ ├── interrupt.c # Interrupt initialisation +│ └── platforms/ # Platform-specific startup and putchar +├── tests/ # Self-contained runtime unit tests +├── vendor/ # Third-party sources (printf, riscv-opcodes) +└── link/ # Linker script template (common.ld.in) +``` + +## Key API + +### Topology (`snrt.h`) + +```c +uint32_t snrt_cluster_core_idx(); // Core index within the cluster (0-based) +uint32_t snrt_cluster_core_num(); // Total cores in the cluster +uint32_t snrt_cluster_tile_idx(); // Tile index within the cluster +uint32_t snrt_cluster_tile_num(); 
// Number of tiles in the cluster +int snrt_is_compute_core(); // Non-zero if this is a compute (non-DMA) core +``` + +### Synchronisation (`snrt.h`) + +```c +void snrt_cluster_hw_barrier(); // Hardware barrier: stalls until all cluster cores arrive +void snrt_cluster_sw_barrier(); // Software barrier (polling) +void snrt_global_barrier(); // Cluster-to-cluster barrier +``` + +### L1 Data Cache — CachePool-specific (`l1cache.h`) + +All **cluster-wide** functions must be called by **every core** in the cluster. They +internally issue a `fence`, a hardware barrier, execute the operation on core 0 only, +and then issue a final barrier before returning. The low-level single-core variants +(without the `_cluster_` prefix) are for use inside the runtime or in single-core +contexts only. + +#### Cluster-wide flush (recommended for application code) + +```c +void l1d_cluster_flush(); // Flush all banks in all tiles +void l1d_cluster_shared_flush(); // Flush shared banks only +void l1d_cluster_private_flush(uint32_t tile); // Flush private banks of selected tiles (one-hot mask) +``` + +#### Cache configuration (cluster-wide) + +```c +// Set the crossbar interleaving offset (in bits). +// Granularity is clamped to >= log2(cacheline_bytes). +// Example: l1d_xbar_config(6) for 512-bit cachelines (6 = log2(64)). +void l1d_xbar_config(uint32_t offset); + +// Set the number of private banks per tile (0=all-shared … 4=all-private). +void l1d_part(uint32_t size); +``` + +#### Address boundary and polling + +```c +// Set the private/shared address boundary (default 0xA000_0000). +// Addresses >= boundary are private; addresses < boundary are shared. +// Requires a flush before changing while valid data is cached. +void l1d_addr(uint32_t addr); + +// Poll the peripheral until the current flush instruction completes. +// Used by the low-level flush functions; not normally needed in application code. 
+void l1d_wait(); +``` + +#### Cache initialisation (called once at boot, single-core) + +```c +// Invalidate all cache banks (insn = 2'b11). Called from start_snitch.S. +void l1d_init(uint32_t size); +``` + +### Performance Counters (`perf_cnt.h`) *TODO: REMOVE* + +```c +void snrt_start_perf_counter(enum snrt_perf_cnt, enum snrt_perf_cnt_type, uint32_t hart_id); +void snrt_stop_perf_counter(enum snrt_perf_cnt); +void snrt_reset_perf_counter(enum snrt_perf_cnt); +uint32_t snrt_get_perf_counter(enum snrt_perf_cnt); +``` + +Counter types include cycles, TCDM accesses, TCDM congestion, FPU issues, retired +instructions, DMA bandwidth events, and ICache statistics. + +### Memory Allocation (`snrt.h`) + +Two allocators are provided for different memory regions. + +**L1 TCDM — bump allocator** (no free support): + +```c +void *snrt_l1alloc(size_t size); // Bump-allocate from cluster TCDM scratchpad +void snrt_l1alloc_reset(); // Reclaim all L1 allocations at once +``` + +**DRAM — linked-list allocator** (single-core, supports free + coalescing): + +```c +void *snrt_malloc(size_t size); // Allocate from DRAM; payload rounded up to 64 B +void snrt_free(void *ptr); // Free and coalesce with following free blocks +``` -This library implements a minimal runtime for Snitch systems, which is responsible for the following: +Both the block header and the payload are cacheline-aligned (64 bytes). A request for +any size — even 1 byte — allocates a minimum of 64 bytes of payload. The allocator +must be called by a **single core only**; it is not thread-safe by design since +allocation is expected to happen in single-core initialisation phases. -- Detecting the hardware configuration (cores, clusters, ISA extensions, TCDM) -- Passing a descriptor struct to the executable -- Synchronization across cores and clusters -- Team-based multithreading and work splitting +The heap begins at `_edram + l3off` (set in `snrt_alloc_init`) and grows upward. 
+Block headers (64 bytes each) are stored in DRAM immediately before their payloads +and are accessed through the L1 cache like any other data. -## General Runtime +### DMA (`snrt.h`) *TODO: REMOVE* -The general runtime (`libsnRuntime`) relies on a bootloader or operating system to load the executable. This usually requires virtual memory to map the segments to the correct addresses. The general runtime does not provide any startup code in this scenario, but is more like a regular library providing some useful API. +```c +snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src, size_t size); +snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, size_t size, + size_t dst_stride, size_t src_stride, size_t repeat); +void snrt_dma_wait(snrt_dma_txid_t tid); +void snrt_dma_wait_all(); +``` -## Bare Runtime +## Typical Initialisation Pattern -The bare runtimes (`libsnRuntime-`) assumes that the executable it is being linked into will run in a bare-metal fashion with no convenient bootloader or virtual memory setup. For this scenario, the runtime provides the `_start` symbol and implements a basic crt0. +```c +#include <snrt.h> +#include <l1cache.h> -## Usage +int main() { + const uint32_t cid = snrt_cluster_core_idx(); -The runtime library can be compiled as follows: + // Configure cache xbar and partition — must be called by ALL cores. + l1d_xbar_config(6); // interleave at cacheline granularity + l1d_part(0); // all-shared - mkdir build - cd build - cmake .. - make + // Single-core init: allocate buffers, set up data structures. + if (cid == 0) { + float *buf = (float *)snrt_malloc(N * sizeof(float)); + // ... populate buf, other setup ... + } + snrt_cluster_hw_barrier(); -The tests can be executed as follows: + // ... parallel computation ... - make test + // Flush before reading results back — must be called by ALL cores. + l1d_cluster_flush(); -Interesting CMake options that can be set via `-D