From 5acdab1fc43875cfda02f2d96ba8e41304bd629f Mon Sep 17 00:00:00 2001
From: ecall73
Date: Wed, 3 Jun 2026 01:16:49 +0800
Subject: [PATCH] feat(hpm): Add hpmevent for CUTE

---
 src/main/scala/Bundles.scala        | 37 +++++++++++++++++++++++++++++
 src/main/scala/CUTETOP.scala        | 24 +++++++++++++++++++
 src/main/scala/LocalMMU.scala       |  9 +++++++
 src/main/scala/TaskController.scala | 23 ++++++++++++++++++
 4 files changed, 93 insertions(+)

diff --git a/src/main/scala/Bundles.scala b/src/main/scala/Bundles.scala
index cd8d3de..d0cb184 100644
--- a/src/main/scala/Bundles.scala
+++ b/src/main/scala/Bundles.scala
@@ -206,3 +206,40 @@ object Bundles {
     def arithOp() : UInt = "b11".U
   }
 }
+
+object CutePerfEventCounts { // element counts of the two Vecs declared in CutePerfToCoreIO
+  val Backend = 9 // length of CutePerfToCoreIO.backendEvents
+  val Mem = 12 // length of CutePerfToCoreIO.memEvents
+}
+
+class TaskControllerPerfProbe(implicit p: Parameters) extends CuteBundle { // 1-bit probes exported as TaskControllerIO.perfProbe
+  val ownedWork = Bool() // driven by: deqValid || (winCount =/= 0.U)
+  val retire = Bool() // driven by: retireFire
+  val loadADone = Bool() // driven by: loadAFinishEventEn
+  val loadBDone = Bool() // driven by: loadBFinishEventEn
+  val loadCDone = Bool() // driven by: loadCFinishEventEn
+  val storeDone = Bool() // driven by: storeFinishEventEn
+  val compDone = Bool() // driven by: computeWriteCFinishEventEn
+  val releaseDone = Bool() // driven by: issueFire && issueSlot.opKind === TaskCtrlOpKind.Release
+  val mmaNonfpDone = Bool() // compute finish whose decoded MMA control has isfp low
+  val mmaFp16Done = Bool() // compute finish with compute type F16F16F32
+  val mmaBf16Done = Bool() // compute finish with compute type BF16BF16F32
+  val mmaTf32Done = Bool() // compute finish with compute type TF32TF32F32
+  val amlActive = Bool() // driven by: fuAML.busy
+  val bmlActive = Bool() // driven by: fuBML.busy
+  val cmlLoadActive = Bool() // driven by: fuCMLLoad.busy
+  val mteActive = Bool() // driven by: fuCompute.busy
+  val cmlStoreActive = Bool() // driven by: fuCMLStore.busy
+}
+
+class LocalMMUPerfProbe extends Bundle { // probes on LocalMMU's LastLevelCacheTLIO request channel
+  val rdReq = Bool() // request fired with RequestType_isWrite low
+  val wrReq = Bool() // request fired with RequestType_isWrite high
+  val rd32BReq = UInt(6.W) // PopCount(RequestMask) >> 5 on a firing read, otherwise 0
+  val wr32BReq = UInt(6.W) // PopCount(RequestMask) >> 5 on a firing write, otherwise 0
+}
+
+class CutePerfToCoreIO(implicit p: Parameters) extends CuteBundle { // event vectors exposed as CUTETopIO.perf
+  val backendEvents = Vec(CutePerfEventCounts.Backend, UInt(6.W)) // all 9 entries come from TaskController probes
+  val memEvents = Vec(CutePerfEventCounts.Mem, UInt(6.W)) // entries 0-7: TaskController probes; 8-11: LocalMMU probes
+}
diff --git a/src/main/scala/CUTETOP.scala b/src/main/scala/CUTETOP.scala
index a469d5f..654c966 100644
--- a/src/main/scala/CUTETOP.scala
+++ b/src/main/scala/CUTETOP.scala
@@ -10,6 +10,7 @@ import org.chipsalliance.cde.config._
 class CUTETopIO()(implicit p: Parameters) extends CuteBundle{
   val mmu2llc = Flipped(new MMU2TLIO)
   val ctrl2top = Flipped(new YGJKControl)
+  val perf = Output(new CutePerfToCoreIO) // perf event vectors; CUTEV2Top drives this through a RegNext stage
 }
 class CUTEV2Top()(implicit p: Parameters) extends CuteModule{
   val io = IO(new CUTETopIO)
@@ -150,6 +151,29 @@ class CUTEV2Top()(implicit p: Parameters) extends CuteModule{
 
   MMU.io.LastLevelCacheTLIO <> io.mmu2llc
   io.ctrl2top <> TaskCtrl.io.ygjkctrl
+  val perf = WireInit(0.U.asTypeOf(new CutePerfToCoreIO)) // all-zero default; every Vec element is reassigned below
+  perf.backendEvents(0) := TaskCtrl.io.perfProbe.ownedWork // backendEvents(0..8): Bool probes connected to UInt(6.W) elements
+  perf.backendEvents(1) := TaskCtrl.io.perfProbe.retire
+  perf.backendEvents(2) := TaskCtrl.io.perfProbe.compDone
+  perf.backendEvents(3) := TaskCtrl.io.perfProbe.releaseDone
+  perf.backendEvents(4) := TaskCtrl.io.perfProbe.mteActive
+  perf.backendEvents(5) := TaskCtrl.io.perfProbe.mmaNonfpDone
+  perf.backendEvents(6) := TaskCtrl.io.perfProbe.mmaFp16Done
+  perf.backendEvents(7) := TaskCtrl.io.perfProbe.mmaBf16Done
+  perf.backendEvents(8) := TaskCtrl.io.perfProbe.mmaTf32Done
+  perf.memEvents(0) := TaskCtrl.io.perfProbe.loadADone // memEvents(0..7): TaskController Bool probes
+  perf.memEvents(1) := TaskCtrl.io.perfProbe.loadBDone
+  perf.memEvents(2) := TaskCtrl.io.perfProbe.loadCDone
+  perf.memEvents(3) := TaskCtrl.io.perfProbe.storeDone
+  perf.memEvents(4) := TaskCtrl.io.perfProbe.amlActive
+  perf.memEvents(5) := TaskCtrl.io.perfProbe.bmlActive
+  perf.memEvents(6) := TaskCtrl.io.perfProbe.cmlLoadActive
+  perf.memEvents(7) := TaskCtrl.io.perfProbe.cmlStoreActive
+  perf.memEvents(8) := MMU.io.perfProbe.rdReq // memEvents(8..11): LocalMMU probes
+  perf.memEvents(9) := MMU.io.perfProbe.wrReq
+  perf.memEvents(10) := MMU.io.perfProbe.rd32BReq // UInt(6.W) per-cycle amount rather than a Bool
+  perf.memEvents(11) := MMU.io.perfProbe.wr32BReq // UInt(6.W) per-cycle amount rather than a Bool
+  io.perf := RegNext(perf, 0.U.asTypeOf(new CutePerfToCoreIO)) // one register stage with an all-zero reset value
 
 
   //给每个 MatrixReg 的输入进行默认赋值
diff --git a/src/main/scala/LocalMMU.scala b/src/main/scala/LocalMMU.scala
index 5cbd1a2..abeb35c 100644
--- a/src/main/scala/LocalMMU.scala
+++ b/src/main/scala/LocalMMU.scala
@@ -14,6 +14,7 @@ class LocalMMU()(implicit p: Parameters) extends CuteModule{
     val CLoadLocalMMUIO = (new LocalMMUIO)
     val CStoreLocalMMUIO = (new LocalMMUIO)
     val LastLevelCacheTLIO = Flipped(new MMU2TLIO)
+    val perfProbe = Output(new LocalMMUPerfProbe) // probes derived from LastLevelCacheTLIO.Request at the end of this module
   })
 
   val FirstRequestIndex = RegInit(0.U(LocalMMUTaskType.TaskTypeBitWidth.W))
@@ -205,4 +206,12 @@ class LocalMMU()(implicit p: Parameters) extends CuteModule{
   val cStoreWr = io.CStoreLocalMMUIO.Request.fire & io.CStoreLocalMMUIO.Request.bits.RequestType_isWrite
   XSPerfAccumulate("CUTE_MMU_C_rd_request", cLoadRd || cStoreRd)
   XSPerfAccumulate("CUTE_MMU_C_wr_request", cLoadWr || cStoreWr)
+
+  val outReqFire = io.LastLevelCacheTLIO.Request.fire // handshake on the outgoing request channel
+  val outReqIsWr = io.LastLevelCacheTLIO.Request.bits.RequestType_isWrite // write flag of the outgoing request
+  val outReqMask32B = PopCount(io.LastLevelCacheTLIO.Request.bits.RequestMask) >> 5 // set-bit count / 32, truncating; NOTE(review): presumably one mask bit per byte - confirm
+  io.perfProbe.rdReq := outReqFire && !outReqIsWr // a read request fires this cycle
+  io.perfProbe.wrReq := outReqFire && outReqIsWr // a write request fires this cycle
+  io.perfProbe.rd32BReq := Mux(outReqFire && !outReqIsWr, outReqMask32B, 0.U).asUInt // forced to 0 unless a read fires
+  io.perfProbe.wr32BReq := Mux(outReqFire && outReqIsWr, outReqMask32B, 0.U).asUInt // forced to 0 unless a write fires
 }
diff --git a/src/main/scala/TaskController.scala b/src/main/scala/TaskController.scala
index 11eb992..b6fc9de 100644
--- a/src/main/scala/TaskController.scala
+++ b/src/main/scala/TaskController.scala
@@ -30,6 +30,7 @@ class TaskControllerIO(implicit p: Parameters) extends CuteBundle {
   val CML_MicroTask_Config = new CMLMicroTaskConfigIO
   val MTE_MicroTask_Config = new MTEMicroTaskConfigIO
   val DebugTimeStampe = Input(UInt(32.W))
+  val perfProbe = Output(new TaskControllerPerfProbe) // perf probes read by CUTEV2Top
 }
 
 abstract class BaseTaskController(implicit p: Parameters) extends CuteModule {
@@ -194,6 +195,7 @@ class TaskController(implicit p: Parameters) extends BaseTaskController {
 
   io.MTE_MicroTask_Config.MicroTaskValid := false.B
   io.MTE_MicroTask_Config.computeType := MteComputeType.ComputeTypeUndef
+  io.perfProbe := 0.U.asTypeOf(new TaskControllerPerfProbe) // all-zero default; each field is reassigned later in this class
 
   // ===================== ChiselDB event definitions =====================
   private val TileDimWidth = Bundles.Mtilex.width
@@ -574,6 +576,7 @@ class TaskController(implicit p: Parameters) extends BaseTaskController {
 
   val enqueueFire = decodedFifo.io.deq.fire
   val enqueueSlotIdx = Mux(windowFull, winHead, winTail)
+  val ownedWork = deqValid || (winCount =/= 0.U) // high while deqValid is set or winCount is nonzero
 
   // ===================== Issue dispatch bridge =====================
   val issueCtrl = issueSlot.entry.ctrl
@@ -1411,4 +1414,24 @@ class TaskController(implicit p: Parameters) extends BaseTaskController {
 
   storeEventTable.log(storeFinishEvent, storeFinishEventEn, "StoreFinish", clock, reset)
   releaseEventTable.log(releaseIssueEvent, releaseIssueEventEn, "ReleaseIssue", clock, reset)
+
+  val mmaDoneType = decodeMmaComputeType(decodeMma(slots(fuCompute.ownerSlot).entry.ctrl)) // compute type decoded from the slot selected by fuCompute.ownerSlot
+  val releaseDone = issueFire && issueSlot.opKind === TaskCtrlOpKind.Release // NOTE(review): asserted when a Release op issues, not on a finish event - confirm the "Done" naming
+  io.perfProbe.ownedWork := ownedWork
+  io.perfProbe.retire := retireFire
+  io.perfProbe.loadADone := loadAFinishEventEn // the *Done probes below reuse the ChiselDB event enables
+  io.perfProbe.loadBDone := loadBFinishEventEn
+  io.perfProbe.loadCDone := loadCFinishEventEn
+  io.perfProbe.storeDone := storeFinishEventEn
+  io.perfProbe.compDone := computeWriteCFinishEventEn
+  io.perfProbe.releaseDone := releaseDone
+  io.perfProbe.mmaNonfpDone := computeWriteCFinishEventEn && !decodeMma(slots(fuCompute.ownerSlot).entry.ctrl).isfp // same slot decode as mmaDoneType, testing the isfp flag
+  io.perfProbe.mmaFp16Done := computeWriteCFinishEventEn && (mmaDoneType === MteComputeType.F16F16F32)
+  io.perfProbe.mmaBf16Done := computeWriteCFinishEventEn && (mmaDoneType === MteComputeType.BF16BF16F32)
+  io.perfProbe.mmaTf32Done := computeWriteCFinishEventEn && (mmaDoneType === MteComputeType.TF32TF32F32)
+  io.perfProbe.amlActive := fuAML.busy // the *Active probes mirror each functional unit's busy flag
+  io.perfProbe.bmlActive := fuBML.busy
+  io.perfProbe.cmlLoadActive := fuCMLLoad.busy
+  io.perfProbe.mteActive := fuCompute.busy
+  io.perfProbe.cmlStoreActive := fuCMLStore.busy
 }