Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/main/scala/cutewrapper/XSCuteTop.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class XSCuteTopImpl(wrapper: XSCuteTop) extends LazyModuleImp(wrapper) {
val matrix_data_in = wrapper.cute_tl.module.io.matrix_data_in.cloneType
})
io.ctrl2top <> cute.io.ctrl2top
io.perf <> cute.io.perf
wrapper.cute_tl.module.io.matrix_data_in <> io.matrix_data_in
wrapper.cute_tl.module.io.mmu <> cute.io.mmu2llc
io.mmu2llc := DontCare
Expand Down
11 changes: 11 additions & 0 deletions src/main/scala/xiangshan/XSCore.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import xiangshan.backend.trace.TraceCoreInterface
import xiangshan.mem._
import xiangshan.cache.mmu._
import xiangshan.cache.mmu.TlbRequestIO
import cute.CutePerfToCoreIO
import scala.collection.mutable.ListBuffer

abstract class XSModule(implicit val p: Parameters) extends Module
Expand Down Expand Up @@ -119,6 +120,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
val dft_reset = Option.when(hasDFT)(Input(new DFTResetSignals()))
val amuCtrl = Option.when(HasMatrixExtension)(Decoupled(new AmuCtrlIO))
val amuRelease = Option.when(HasMatrixExtension)(Flipped(Decoupled(new AmuReleaseIO2XS)))
val cutePerf = Option.when(HasMatrixExtension)(Input(new CutePerfToCoreIO))
})

dontTouch(io.l2_flush_done)
Expand Down Expand Up @@ -204,8 +206,17 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
backend.io.perf.perfEventsLsu := memBlock.io_perf
backend.io.perf.perfEventsHc := memBlock.io.inner_hc_perfEvents
backend.io.perf.perfEventsBackend := DontCare
backend.io.perf.perfEventsMatrixBackend.foreach(_.value := 0.U)
backend.io.perf.perfEventsMatrixMem.foreach(_.value := 0.U)
backend.io.perf.retiredInstr := DontCare
backend.io.perf.ctrlInfo := DontCare
memBlock.io.outer_matrixPerfEvents.foreach(_.value := 0.U)

// CUTE selected: drive the matrix perf-event ports from io.cutePerf. These connections are
// issued after the zero tie-offs of the same three ports earlier in this section.
if (HasMatrixExtension && p(MatAccKey) == MatAcc.CUTE) {
  // Unwrap the optional port once instead of at every use.
  val cutePerf = io.cutePerf.get
  // Backend-side events, each reinterpreted as a PerfEvent.
  backend.io.perf.perfEventsMatrixBackend := VecInit(
    cutePerf.backendEvents.map(_.asTypeOf(new PerfEvent))
  )
  // The memory-side events feed two consumers: the backend and the mem block.
  backend.io.perf.perfEventsMatrixMem := VecInit(
    cutePerf.memEvents.map(_.asTypeOf(new PerfEvent))
  )
  memBlock.io.outer_matrixPerfEvents := VecInit(
    cutePerf.memEvents.map(_.asTypeOf(new PerfEvent))
  )
}

backend.io.mem.storeDebugInfo <> memBlock.io.mem_to_ooo.storeDebugInfo

Expand Down
7 changes: 6 additions & 1 deletion src/main/scala/xiangshan/XSTile.scala
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import coupledL2.tl2chi.PortIO
import xiangshan.backend.trace.TraceCoreInterface
import xiangshan.backend.fu.matrix._
import xiangshan.backend.fu.matrix.Bundles._
import cute.XSCute
import cute.{CutePerfToCoreIO, XSCute}

object MatAcc extends Enumeration {
type MatAcc = Value
Expand Down Expand Up @@ -258,10 +258,15 @@ class XSTile()(implicit p: Parameters) extends LazyModule

val matrix_data_out = l2top.module.io.matrixDataOut512L2
cute.module.io.matrix_data_in <> matrix_data_out
core.module.io.cutePerf.foreach(_ <> cute.module.io.cute.perf)

cute.module.io.hartId := io.hartId
}

// Matrix extension enabled but no CUTE instance present (cuteOpt is empty): drive the core's
// optional cutePerf input with an all-zero CutePerfToCoreIO value.
if (HasMatrixExtension && cuteOpt.isEmpty) {
  core.module.io.cutePerf.foreach { perfPort =>
    perfPort := 0.U.asTypeOf(new CutePerfToCoreIO)
  }
}

if (!HasMatrixExtension) {
l2top.module.io.matrixDataOut512L2 := DontCare
l2top.module.io.matrixDataOut512L2.foreach(_.ready := false.B)
Expand Down
30 changes: 24 additions & 6 deletions src/main/scala/xiangshan/backend/Backend.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1081,7 +1081,8 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame
pfevent.io.distribute_csr := RegNext(csrio.customCtrl.distribute_csr)
val csrevents = pfevent.io.hpmevent.slice(8,16)

val ctrlBlockPerf = ctrlBlock.getPerfEvents
val ctrlBlockPerfBase = ctrlBlock.getPerfEventsBase
val ctrlBlockPerfExt = ctrlBlock.getPerfEventsExt
val intSchedulerPerf = intScheduler.asInstanceOf[SchedulerArithImp].getPerfEvents
val fpSchedulerPerf = fpScheduler.asInstanceOf[SchedulerArithImp].getPerfEvents
val vecSchedulerPerf = vfScheduler.asInstanceOf[SchedulerArithImp].getPerfEvents
Expand All @@ -1095,11 +1096,28 @@ class BackendInlinedImp(override val wrapper: BackendInlined)(implicit p: Parame
val memSchedulerPerf = memScheduler.asInstanceOf[SchedulerMemImp].getPerfEvents
val dataPathPerf = dataPath.getPerfEvents

val perfBackend = Seq()
// let index = 0 be no event
val allPerfEvents = Seq(("noEvent", 0.U)) ++ ctrlBlockPerf ++ dataPathPerf ++
intSchedulerPerf ++ fpSchedulerPerf ++ vecSchedulerPerf ++ mfSchedulerPerf ++
memSchedulerPerf ++ perfBackend
// Matrix backend events, present only when the CUTE accelerator is selected.
// The position of a name in the list is the index read from io.perf.perfEventsMatrixBackend.
val perfBackend = if (HasMatrixExtension && p(MatAccKey) == MatAcc.CUTE) {
  val matrixBackendEventNames = Seq(
    "amu_active_cycle",
    "amu_retire",
    "amu_comp_done",
    "amu_release_done",
    "amu_mte_active",
    "amu_mma_nonfp",
    "amu_mma_fp16",
    "amu_mma_bf16",
    "amu_mma_tf32",
  )
  matrixBackendEventNames.zipWithIndex.map { case (eventName, idx) =>
    (eventName, io.perf.perfEventsMatrixBackend(idx).value)
  }
} else {
  Seq()
}
// Keep kunminghu-v2 backend events as a stable prefix, and append matrix-related events after it.
// Base list, in order: the "noEvent" placeholder, ctrl-block base events, data-path events,
// the first 5 int-scheduler events, all fp- and vec-scheduler events, and the first 10
// mem-scheduler events.
// NOTE(review): the split points 5 and 10 are hard-coded; presumably they equal the number of
// int/mem scheduler events that existed before the matrix additions — confirm against the schedulers.
val perfEventsBase = Seq(("noEvent", 0.U)) ++ ctrlBlockPerfBase ++ dataPathPerf ++
intSchedulerPerf.take(5) ++ fpSchedulerPerf ++ vecSchedulerPerf ++
memSchedulerPerf.take(10)
// Extension list, in order: ctrl-block extension events, the remaining int-scheduler events,
// mf-scheduler events, the remaining mem-scheduler events, then perfBackend.
val perfEventsExt = ctrlBlockPerfExt ++ intSchedulerPerf.drop(5) ++ mfSchedulerPerf ++
memSchedulerPerf.drop(10) ++ perfBackend
// Full list: base events first, extension events after them.
val allPerfEvents = perfEventsBase ++ perfEventsExt


if (printEventCoding) {
Expand Down
7 changes: 6 additions & 1 deletion src/main/scala/xiangshan/backend/CtrlBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -850,9 +850,14 @@ class CtrlBlockImp(
io.perfInfo.ctrlInfo.fpdqFull := false.B
io.perfInfo.ctrlInfo.lsdqFull := false.B

val perfEvents = Seq(decode, rename, dispatch, rob).flatMap(_.getPerfEvents)
// Base events, in order: decode, the base part of rename, dispatch, rob.
val perfEventsBase = decode.getPerfEvents ++ rename.getPerfEventsBase ++ dispatch.getPerfEvents ++ rob.getPerfEvents
// Extension events: only rename contributes to this list.
val perfEventsExt = rename.getPerfEventsExt
// Full list is base followed by extension; getPerfEventsBase/getPerfEventsExt split io_perf
// at perfEventsBase.length, so this ordering must be kept.
val perfEvents = perfEventsBase ++ perfEventsExt
// NOTE(review): presumably this builds io_perf from perfEvents — defined outside this view, confirm.
generatePerfEvent()

/** Pairs each base event name with the value of the io_perf entry at the same position. */
def getPerfEventsBase: Seq[(String, UInt)] = {
  val baseNames = perfEventsBase.map(_._1)
  // The first perfEventsBase.length entries of io_perf are paired with the base events.
  val baseCounters = io_perf.take(perfEventsBase.length)
  baseNames.zip(baseCounters).map(pair => (pair._1, pair._2.value))
}
/** Pairs each extension event name with the value of the io_perf entry that follows the base entries. */
def getPerfEventsExt: Seq[(String, UInt)] = {
  val extNames = perfEventsExt.map(_._1)
  // Skip the leading perfEventsBase.length entries; the rest are paired with the extension events.
  val extCounters = io_perf.drop(perfEventsBase.length)
  extNames.zip(extCounters).map(pair => (pair._1, pair._2.value))
}

val criticalErrors = rob.getCriticalErrors
generateCriticalErrors()
}
Expand Down
2 changes: 2 additions & 0 deletions src/main/scala/xiangshan/backend/fu/CSR.scala
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ class PerfCounterIO(implicit p: Parameters) extends XSBundle {
val perfEventsBackend = Vec(numCSRPCntCtrl, new PerfEvent)
val perfEventsLsu = Vec(numCSRPCntLsu, new PerfEvent)
val perfEventsHc = Vec(numPCntHc * coreParams.L2NBanks + 1, new PerfEvent)
val perfEventsMatrixBackend = Vec(9, new PerfEvent)
val perfEventsMatrixMem = Vec(12, new PerfEvent)
val retiredInstr = UInt(7.W)
val frontendInfo = new Bundle {
val ibufFull = Bool()
Expand Down
16 changes: 14 additions & 2 deletions src/main/scala/xiangshan/backend/rename/Rename.scala
Original file line number Diff line number Diff line change
Expand Up @@ -868,14 +868,26 @@ class Rename(implicit p: Parameters) extends XSModule with HasCircularQueuePtrHe
("rename_stall_cycle_vec ", inHeadValid && !io.rabCommits.isWalk && dispatchCanAcc && intFreeList.io.canAllocate && fpFreeList.io.canAllocate && v0FreeList.io.canAllocate && vlFreeList.io.canAllocate && !vecFreeList.io.canAllocate),
("rename_stall_cycle_v0 ", inHeadValid && !io.rabCommits.isWalk && dispatchCanAcc && intFreeList.io.canAllocate && fpFreeList.io.canAllocate && vecFreeList.io.canAllocate && vlFreeList.io.canAllocate && !v0FreeList.io.canAllocate),
("rename_stall_cycle_vl ", inHeadValid && !io.rabCommits.isWalk && dispatchCanAcc && intFreeList.io.canAllocate && fpFreeList.io.canAllocate && vecFreeList.io.canAllocate && v0FreeList.io.canAllocate && !vlFreeList.io.canAllocate),
("rename_stall_cycle_mx ", inHeadValid && !io.rabCommits.isWalk && dispatchCanAcc && intFreeList.io.canAllocate && fpFreeList.io.canAllocate && vecFreeList.io.canAllocate && v0FreeList.io.canAllocate && !mxFreeList_io_canAllocate),
)
val intFlPerf = intFreeList.getPerfEvents
val fpFlPerf = fpFreeList.getPerfEvents
val vecFlPerf = vecFreeList.getPerfEvents
val v0FlPerf = v0FreeList.getPerfEvents
val mxFlPerf = OptionWrapper(HasMatrixExtension, mxFreeList.get.getPerfEvents)
val vlFlPerf = vlFreeList.getPerfEvents
val perfEvents = renamePerf ++ intFlPerf ++ fpFlPerf ++ vecFlPerf ++ v0FlPerf ++ mxFlPerf.getOrElse(Seq()) ++ vlFlPerf

// Base events, in order: rename stall events, then the int, fp, vec, v0 and vl free-list events.
val perfEventsBase = renamePerf ++ intFlPerf ++ fpFlPerf ++ vecFlPerf ++ v0FlPerf ++ vlFlPerf
// Extension events: the mx stall counter followed by the mx free-list events.
// mxFlPerf is defined only when HasMatrixExtension is set, so folding over it yields the same
// list as branching on the flag, without calling .get on the Option.
// NOTE(review): unlike the vec/v0/vl stall conditions, this one does not require
// vlFreeList.io.canAllocate — confirm whether that omission is intended.
val perfEventsExt = mxFlPerf.fold(Seq.empty[(String, UInt)]) { mxFreeListEvents =>
  Seq[(String, UInt)](
    ("rename_stall_cycle_mx ", inHeadValid && !io.rabCommits.isWalk && dispatchCanAcc && intFreeList.io.canAllocate && fpFreeList.io.canAllocate && vecFreeList.io.canAllocate && v0FreeList.io.canAllocate && !mxFreeList_io_canAllocate),
  ) ++ mxFreeListEvents
}

// Full list is base followed by extension; getPerfEventsBase/getPerfEventsExt split io_perf
// at perfEventsBase.length, so this ordering must be kept.
val perfEvents = perfEventsBase ++ perfEventsExt
generatePerfEvent()

/** Base event names, each paired with the value of the io_perf entry at the same index. */
def getPerfEventsBase: Seq[(String, UInt)] = {
  val baseCount = perfEventsBase.length
  val names = perfEventsBase.map(_._1)
  // Only the leading baseCount entries of io_perf take part in the pairing.
  names.zip(io_perf.take(baseCount)).map { case (eventName, counter) =>
    (eventName, counter.value)
  }
}
/** Extension event names, each paired with the value of the io_perf entry after the base entries. */
def getPerfEventsExt: Seq[(String, UInt)] = {
  val baseCount = perfEventsBase.length
  val names = perfEventsExt.map(_._1)
  // The entries of io_perf past the first baseCount take part in the pairing.
  names.zip(io_perf.drop(baseCount)).map { case (eventName, counter) =>
    (eventName, counter.value)
  }
}
}
21 changes: 20 additions & 1 deletion src/main/scala/xiangshan/mem/MemBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
val outer_beu_errors_icache = Output(new L1BusErrorUnitInfo)
val inner_hc_perfEvents = Output(Vec(numPCntHc * coreParams.L2NBanks + 1, new PerfEvent))
val outer_hc_perfEvents = Input(Vec(numPCntHc * coreParams.L2NBanks + 1, new PerfEvent))
val outer_matrixPerfEvents = Input(Vec(12, new PerfEvent))
val outer_l2PfCtrl = Output(new PrefetchCtrlFromCore)

// reset signals of frontend & backend are generated in memblock
Expand Down Expand Up @@ -2203,8 +2204,26 @@ class MemBlockInlinedImp(outer: MemBlockInlined) extends LazyModuleImp(outer)
val perfFromPTW = perfEventsPTW.map(x => ("PTW_" + x._1, x._2))
val perfBlock = Seq(("ldDeqCount", ldDeqCount),
("stDeqCount", stDeqCount))
// Matrix memory-side events, present only when the CUTE accelerator is selected.
// The position of a name in the list is the index read from io.outer_matrixPerfEvents.
val perfMatrix = if (HasMatrixExtension && p(MatAccKey) == MatAcc.CUTE) {
  val matrixMemEventNames = Seq(
    "amu_load_a_done",
    "amu_load_b_done",
    "amu_load_c_done",
    "amu_store_done",
    "amu_aml_active",
    "amu_bml_active",
    "amu_cml_load_active",
    "amu_cml_store_active",
    "amu_mem_rd_req",
    "amu_mem_wr_req",
    "amu_mem_rd_32B_req",
    "amu_mem_wr_32B_req",
  )
  matrixMemEventNames.zipWithIndex.map { case (eventName, idx) =>
    (eventName, io.outer_matrixPerfEvents(idx).value)
  }
} else {
  Seq()
}
// let index = 0 be no event
val allPerfEvents = Seq(("noEvent", 0.U)) ++ perfFromUnits ++ perfFromTLB ++ perfFromPTW ++ perfBlock
val allPerfEvents = Seq(("noEvent", 0.U)) ++ perfFromUnits ++ perfFromTLB ++ perfFromPTW ++ perfBlock ++ perfMatrix

if (printEventCoding) {
for (((name, inc), i) <- allPerfEvents.zipWithIndex) {
Expand Down
Loading