Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Multicluster MemPool #115

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ ifndef config
endif
include $(MEMPOOL_DIR)/config/$(config).mk

# Number of clusters
num_clusters ?= 1

#############################
## Address configuration ##
#############################
Expand Down
49 changes: 49 additions & 0 deletions config/multipool.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2021 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0

# Author: Samuel Riedel, ETH Zurich

###############
##  MemPool  ##
###############

# Number of cores
num_cores ?= 64

# Number of groups
num_groups ?= 16

# Number of clusters
num_clusters ?= 4

# Number of cores per MemPool tile
num_cores_per_tile ?= 4

# L1 scratchpad banking factor
banking_factor ?= 4

# Radix for hierarchical AXI interconnect
# NOTE(review): this file previously defined axi_hier_radix twice (20, then 2)
# and axi_masters_per_group twice (1, then 1). With `?=`, only the FIRST
# assignment of an unset variable takes effect, so the later duplicates were
# dead and misleading. A single definition with the effective value is kept.
axi_hier_radix ?= 20

# Number of AXI masters per group
axi_masters_per_group ?= 1

#########################
##  AXI configuration  ##
#########################
# AXI bus data width (in bits)
axi_data_width ?= 512

# Read-only cache line width in AXI interconnect (in bits)
ro_line_width ?= 512

# Number of DMA backends in each group
dmas_per_group ?= 1
1 change: 1 addition & 0 deletions hardware/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ vlog_defs += -DNUM_CORES=$(num_cores)
vlog_defs += -DNUM_CORES_PER_TILE=$(num_cores_per_tile)
vlog_defs += -DNUM_DIVSQRT_PER_TILE=$(num_divsqrt_per_tile)
vlog_defs += -DNUM_GROUPS=$(num_groups)
vlog_defs += -DNUM_CLUSTERS=$(num_clusters)
vlog_defs += -DBANKING_FACTOR=$(banking_factor)
vlog_defs += -DL2_BASE=32\'d$(l2_base)
vlog_defs += -DL2_SIZE=32\'d$(l2_size)
Expand Down
38 changes: 19 additions & 19 deletions hardware/src/mempool_cluster.sv
Original file line number Diff line number Diff line change
Expand Up @@ -17,32 +17,32 @@ module mempool_cluster
parameter int unsigned NumAXIMasters = NumGroups * NumAXIMastersPerGroup
) (
// Clock and reset
input logic clk_i,
input logic rst_ni,
input logic testmode_i,
input logic clk_i,
input logic rst_ni,
input logic testmode_i,
// Scan chain
input logic scan_enable_i,
input logic scan_data_i,
output logic scan_data_o,
input logic scan_enable_i,
input logic scan_data_i,
output logic scan_data_o,
// Wake up signal
input logic [NumCores-1:0] wake_up_i,
input logic [NumCoresPerCluster-1:0] wake_up_i,
// RO-Cache configuration
input ro_cache_ctrl_t ro_cache_ctrl_i,
input ro_cache_ctrl_t ro_cache_ctrl_i,
// DMA request
input dma_req_t dma_req_i,
input logic dma_req_valid_i,
output logic dma_req_ready_o,
input dma_req_t dma_req_i,
input logic dma_req_valid_i,
output logic dma_req_ready_o,
// DMA status
output dma_meta_t dma_meta_o,
output dma_meta_t dma_meta_o,
// AXI Interface
output axi_tile_req_t [NumAXIMasters-1:0] axi_mst_req_o,
input axi_tile_resp_t [NumAXIMasters-1:0] axi_mst_resp_i
output axi_tile_req_t [NumAXIMasters-1:0] axi_mst_req_o,
input axi_tile_resp_t [NumAXIMasters-1:0] axi_mst_resp_i
);

/*********************
* Control Signals *
*********************/
logic [NumCores-1:0] wake_up_q;
logic [NumCoresPerCluster-1:0] wake_up_q;
`FF(wake_up_q, wake_up_i, '0, clk_i, rst_ni);

ro_cache_ctrl_t [NumGroups-1:0] ro_cache_ctrl_q;
Expand Down Expand Up @@ -494,13 +494,13 @@ module mempool_cluster
* Assertions *
****************/

if (NumCores > 1024)
$fatal(1, "[mempool] MemPool is currently limited to 1024 cores.");
if (NumCoresPerCluster > 1024)
$fatal(1, "[mempool] The MemPool cluster is currently limited to 1024 cores.");

if (NumTiles < NumGroups)
if (NumTilesPerCluster < NumGroupsPerCluster)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A condition should also be added on the number of SubGroups: NumTilesPerGroup > NumSubGroups.

$fatal(1, "[mempool] MemPool requires more tiles than groups.");

if (NumCores != NumTiles * NumCoresPerTile)
if (NumCoresPerCluster != NumTilesPerCluster * NumCoresPerTile)
$fatal(1, "[mempool] The number of cores is not divisible by the number of cores per tile.");

if (BankingFactor < 1)
Expand Down
33 changes: 20 additions & 13 deletions hardware/src/mempool_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,23 @@ package mempool_pkg;
`include "axi/assign.svh"
`include "axi/typedef.svh"

localparam integer unsigned NumCores = `ifdef NUM_CORES `NUM_CORES `else 0 `endif;
localparam integer unsigned NumCoresPerTile = `ifdef NUM_CORES_PER_TILE `NUM_CORES_PER_TILE `else 0 `endif;
localparam integer unsigned NumDivsqrtPerTile = `ifdef NUM_DIVSQRT_PER_TILE `NUM_DIVSQRT_PER_TILE `else (snitch_pkg::XDIVSQRT) `endif;
localparam integer unsigned NumGroups = `ifdef NUM_GROUPS `NUM_GROUPS `else 0 `endif;
localparam integer unsigned MAX_NumGroups = 8;
localparam integer unsigned NumTiles = NumCores / NumCoresPerTile;
localparam integer unsigned NumTilesPerGroup = NumTiles / NumGroups;
localparam integer unsigned NumCoresPerGroup = NumCores / NumGroups;
localparam integer unsigned NumCoresPerCache = NumCoresPerTile;
localparam integer unsigned AxiCoreIdWidth = 1;
localparam integer unsigned AxiTileIdWidth = AxiCoreIdWidth+1; // + 1 for cache
localparam integer unsigned AxiDataWidth = `ifdef AXI_DATA_WIDTH `AXI_DATA_WIDTH `else 0 `endif;
localparam integer unsigned AxiLiteDataWidth = 32;
localparam integer unsigned NumCores = `ifdef NUM_CORES `NUM_CORES `else 0 `endif;
localparam integer unsigned NumCoresPerTile = `ifdef NUM_CORES_PER_TILE `NUM_CORES_PER_TILE `else 0 `endif;
localparam integer unsigned NumDivsqrtPerTile = `ifdef NUM_DIVSQRT_PER_TILE `NUM_DIVSQRT_PER_TILE `else (snitch_pkg::XDIVSQRT) `endif;
localparam integer unsigned NumGroups = `ifdef NUM_GROUPS `NUM_GROUPS `else 0 `endif;
localparam integer unsigned NumClusters = `ifdef NUM_CLUSTERS `NUM_CLUSTERS `else 0 `endif;
localparam integer unsigned MAX_NumGroups = 32;
localparam integer unsigned NumGroupsPerCluster = NumGroups / NumClusters;
localparam integer unsigned NumTiles = NumCores / NumCoresPerTile;
localparam integer unsigned NumTilesPerCluster = NumTiles / NumClusters;
localparam integer unsigned NumTilesPerGroup = NumTiles / NumGroups;
localparam integer unsigned NumCoresPerCluster = NumCores / NumClusters;
localparam integer unsigned NumCoresPerGroup = NumCores / NumGroups;
localparam integer unsigned NumCoresPerCache = NumCoresPerTile;
localparam integer unsigned AxiCoreIdWidth = 1;
localparam integer unsigned AxiTileIdWidth = AxiCoreIdWidth+1; // + 1 for cache
localparam integer unsigned AxiDataWidth = `ifdef AXI_DATA_WIDTH `AXI_DATA_WIDTH `else 0 `endif;
localparam integer unsigned AxiLiteDataWidth = 32;

/***********************
* MEMORY PARAMETERS *
Expand All @@ -36,10 +40,13 @@ package mempool_pkg;
localparam integer unsigned DataWidth = 32;
localparam integer unsigned BeWidth = DataWidth / 8;
localparam integer unsigned ByteOffset = $clog2(BeWidth);
// L1 SPM memory
localparam integer unsigned BankingFactor = `ifdef BANKING_FACTOR `BANKING_FACTOR `else 0 `endif;
localparam bit LrScEnable = 1'b1;
localparam integer unsigned TCDMSizePerBank = `ifdef L1_BANK_SIZE `L1_BANK_SIZE `else 0 `endif;
localparam integer unsigned NumBanks = NumCores * BankingFactor;
localparam integer unsigned L1Size = NumCores * BankingFactor * TCDMSizePerBank;
localparam integer unsigned L1SizePerCluster = L1Size / NumClusters;
localparam integer unsigned NumBanksPerTile = NumBanks / NumTiles;
localparam integer unsigned NumBanksPerGroup = NumBanks / NumGroups;
localparam integer unsigned TCDMAddrMemWidth = $clog2(TCDMSizePerBank / mempool_pkg::BeWidth);
Expand Down
110 changes: 58 additions & 52 deletions hardware/src/mempool_system.sv
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,11 @@ module mempool_system
logic [DataWidth-1:0] eoc;
ro_cache_ctrl_t ro_cache_ctrl;

dma_req_t dma_req;
logic dma_req_valid;
logic dma_req_ready;
dma_meta_t dma_meta;
logic [1-1:0] dma_id;
dma_req_t[NumClusters-1:0] dma_req;
logic[NumClusters-1:0] dma_req_valid;
logic[NumClusters-1:0] dma_req_ready;
dma_meta_t[NumClusters-1:0] dma_meta;
logic[NumClusters-1:0][1-1:0] dma_id;

localparam xbar_cfg_t MstDemuxCfg = '{
NoSlvPorts : 1, // Each master has a private demux
Expand Down Expand Up @@ -132,26 +132,27 @@ module mempool_system
/*********************
* MemPool Cluster *
********************/

mempool_cluster #(
.TCDMBaseAddr(TCDMBaseAddr),
.BootAddr (BootAddr )
) i_mempool_cluster (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.wake_up_i (wake_up ),
.testmode_i (1'b0 ),
.scan_enable_i (1'b0 ),
.scan_data_i (1'b0 ),
.scan_data_o (/* Unused */ ),
.ro_cache_ctrl_i(ro_cache_ctrl ),
.dma_req_i (dma_req ),
.dma_req_valid_i(dma_req_valid ),
.dma_req_ready_o(dma_req_ready ),
.dma_meta_o (dma_meta ),
.axi_mst_req_o (axi_mst_req[NumAXIMasters-2:0] ),
.axi_mst_resp_i (axi_mst_resp[NumAXIMasters-2:0])
);
for (genvar i = 0; i < NumClusters; i++) begin : gen_clusters
mempool_cluster #(
.TCDMBaseAddr(i*L1SizePerCluster),
.BootAddr (BootAddr )
) i_mempool_cluster (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.wake_up_i (wake_up[i*NumCoresPerCluster+:NumCoresPerCluster] ),
.testmode_i (1'b0 ),
.scan_enable_i (1'b0 ),
.scan_data_i (1'b0 ),
.scan_data_o (/* Unused */ ),
.ro_cache_ctrl_i(ro_cache_ctrl ),
.dma_req_i (dma_req[i] ),
.dma_req_valid_i(dma_req_valid[i] ),
.dma_req_ready_o(dma_req_ready[i] ),
.dma_meta_o (dma_meta[i] ),
.axi_mst_req_o (axi_mst_req[i*NumAXIMastersPerGroup+:NumAXIMastersPerGroup] ),
.axi_mst_resp_i (axi_mst_resp[i*NumAXIMastersPerGroup+:NumAXIMastersPerGroup])
);
end

/**********************
* AXI Interconnect *
Expand Down Expand Up @@ -658,12 +659,14 @@ module mempool_system
* Control Registers *
***********************/

localparam NumPeriphs = 2; // Control registers + DMA
localparam NumPeriphs = 1 + NumClusters; // Control registers + (NumClusters * DMA)

typedef enum logic [$clog2(NumPeriphs) - 1:0] {
CtrlRegisters,
DMA
} axi_lite_xbar_slave_target;
localparam CtrlRegisters = 0;
localparam DMA = 1;
// typedef enum logic [$clog2(NumPeriphs) - 1:0] {
// CtrlRegisters,
// DMA
// } axi_lite_xbar_slave_target;

axi_periph_req_t axi_periph_narrow_req;
axi_periph_resp_t axi_periph_narrow_resp;
Expand Down Expand Up @@ -692,12 +695,13 @@ module mempool_system
localparam addr_t CtrlRegistersEndAddr = 32'h4001_0000;
localparam addr_t DMABaseAddr = 32'h4001_0000;
localparam addr_t DMAEndAddr = 32'h4002_0000;
localparam addr_t DMARangeAddr = DMAEndAddr - DMABaseAddr;

xbar_rule_32_t [NumPeriphs-1:0] axi_lite_xbar_rules;
assign axi_lite_xbar_rules = '{
'{idx: CtrlRegisters, start_addr: CtrlRegistersBaseAddr, end_addr: CtrlRegistersEndAddr},
'{idx: DMA, start_addr: DMABaseAddr, end_addr: DMAEndAddr}
};
assign axi_lite_xbar_rules[CtrlRegisters] = '{idx: CtrlRegisters, start_addr: CtrlRegistersBaseAddr, end_addr: CtrlRegistersEndAddr};
for (genvar i = 0; i < NumClusters; i++) begin : gen_dma_addr_map
assign axi_lite_xbar_rules[DMA + i] = '{idx: DMA + i, start_addr: DMABaseAddr+(i*DMARangeAddr), end_addr: DMAEndAddr+(i*DMARangeAddr)};
end

axi_dw_converter #(
.AxiMaxReads (1 ), // Number of outstanding reads
Expand Down Expand Up @@ -789,24 +793,26 @@ module mempool_system
.ro_cache_ctrl_o (ro_cache_ctrl )
);

mempool_dma #(
.axi_lite_req_t(axi_lite_slv_req_t ),
.axi_lite_rsp_t(axi_lite_slv_resp_t ),
.burst_req_t (dma_req_t ),
.NumBackends (NumGroups ),
.DmaIdWidth (1 )
) i_mempool_dma (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.config_req_i (axi_lite_slv_req[DMA] ),
.config_res_o (axi_lite_slv_resp[DMA] ),
.burst_req_o (dma_req ),
.valid_o (dma_req_valid ),
.ready_i (dma_req_ready ),
.backend_idle_i (dma_meta.backend_idle ),
.trans_complete_i(dma_meta.trans_complete),
.dma_id_o (dma_id )
);
for (genvar i = 0; i < NumClusters; i++) begin : gen_mempool_dma
mempool_dma #(
.axi_lite_req_t(axi_lite_slv_req_t ),
.axi_lite_rsp_t(axi_lite_slv_resp_t ),
.burst_req_t (dma_req_t ),
.NumBackends (NumGroupsPerCluster ),
.DmaIdWidth (1 )
) i_mempool_dma (
.clk_i (clk_i ),
.rst_ni (rst_ni ),
.config_req_i (axi_lite_slv_req[DMA+i] ),
.config_res_o (axi_lite_slv_resp[DMA+i] ),
.burst_req_o (dma_req[i] ),
.valid_o (dma_req_valid[i] ),
.ready_i (dma_req_ready[i] ),
.backend_idle_i (dma_meta[i].backend_idle ),
.trans_complete_i(dma_meta[i].trans_complete),
.dma_id_o (dma_id[i] )
);
end

assign busy_o = 1'b0;

Expand Down
Loading