Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/main/scala/Bundles.scala
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,40 @@ object Bundles {
def arithOp() : UInt = "b11".U
}
}

/**
 * Sizes of the two performance-event vectors carried by `CutePerfToCoreIO`.
 *
 * The values are plain Scala integers used at elaboration time to size the `Vec`s.
 */
object CutePerfEventCounts {
  /** Number of slots in `CutePerfToCoreIO.backendEvents`. */
  val Backend: Int = 9

  /** Number of slots in `CutePerfToCoreIO.memEvents`. */
  val Mem: Int = 12
}

/**
 * One-bit performance probe flags exposed by the task controller through
 * `TaskControllerIO.perfProbe`.
 *
 * The field comments describe the signal that drives each flag inside `TaskController`.
 */
class TaskControllerPerfProbe(implicit p: Parameters) extends CuteBundle {
// Driven by `deqValid || (winCount =/= 0.U)`.
val ownedWork = Bool()
// Driven by `retireFire`.
val retire = Bool()
// Driven by the matching `load{A,B,C}FinishEventEn` / `storeFinishEventEn` event enables.
val loadADone = Bool()
val loadBDone = Bool()
val loadCDone = Bool()
val storeDone = Bool()
// Driven by `computeWriteCFinishEventEn`.
val compDone = Bool()
// Driven by `issueFire` when the issued slot's op kind is `TaskCtrlOpKind.Release`.
val releaseDone = Bool()
// `computeWriteCFinishEventEn` qualified by the decoded MMA type of the compute owner slot:
// non-floating-point, F16F16F32, BF16BF16F32 and TF32TF32F32 respectively.
val mmaNonfpDone = Bool()
val mmaFp16Done = Bool()
val mmaBf16Done = Bool()
val mmaTf32Done = Bool()
// Driven by the `busy` flag of the corresponding functional unit
// (`fuAML`, `fuBML`, `fuCMLLoad`, `fuCompute`, `fuCMLStore`).
val amlActive = Bool()
val bmlActive = Bool()
val cmlLoadActive = Bool()
val mteActive = Bool()
val cmlStoreActive = Bool()
}

/**
 * Performance probe signals exposed by `LocalMMU` through `io.perfProbe`.
 *
 * All four fields are derived from the request channel of `LastLevelCacheTLIO`.
 */
class LocalMMUPerfProbe extends Bundle {
// High when a request fires on the LLC port and it is not a write.
val rdReq = Bool()
// High when a request fires on the LLC port and it is a write.
val wrReq = Bool()
// `PopCount(RequestMask) >> 5` when a read request fires, otherwise 0.
// NOTE(review): presumably a count of 32-byte units (one mask bit per byte) — confirm.
val rd32BReq = UInt(6.W)
// Same quantity as `rd32BReq`, but for firing write requests.
val wr32BReq = UInt(6.W)
}

/**
 * Performance events exported from the CUTE top level (`CUTETopIO.perf`).
 *
 * Each slot is a 6-bit value. In `CUTEV2Top` most slots carry a single-bit flag,
 * while the last two `memEvents` slots carry the multi-bit 32B request counts
 * from `LocalMMUPerfProbe`.
 */
class CutePerfToCoreIO(implicit p: Parameters) extends CuteBundle {
// `CutePerfEventCounts.Backend` slots, fed from the task-controller probe.
val backendEvents = Vec(CutePerfEventCounts.Backend, UInt(6.W))
// `CutePerfEventCounts.Mem` slots, fed from the task-controller and MMU probes.
val memEvents = Vec(CutePerfEventCounts.Mem, UInt(6.W))
}
24 changes: 24 additions & 0 deletions src/main/scala/CUTETOP.scala
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import org.chipsalliance.cde.config._
/**
 * Top-level IO bundle of `CUTEV2Top`.
 */
class CUTETopIO()(implicit p: Parameters) extends CuteBundle{
// Bulk-connected to `MMU.io.LastLevelCacheTLIO` inside `CUTEV2Top`.
val mmu2llc = Flipped(new MMU2TLIO)
// Bulk-connected to `TaskCtrl.io.ygjkctrl` inside `CUTEV2Top`.
val ctrl2top = Flipped(new YGJKControl)
// Performance events; `CUTEV2Top` drives this from a register (`RegNext`) reset to all zeros.
val perf = Output(new CutePerfToCoreIO)
}
class CUTEV2Top()(implicit p: Parameters) extends CuteModule{
val io = IO(new CUTETopIO)
Expand Down Expand Up @@ -150,6 +151,29 @@ class CUTEV2Top()(implicit p: Parameters) extends CuteModule{
MMU.io.LastLevelCacheTLIO <> io.mmu2llc

io.ctrl2top <> TaskCtrl.io.ygjkctrl
// ===================== Performance events exported to the core =====================
// The source lists below are positional: the n-th entry drives slot n of the matching
// Vec, so moving an entry changes the event index observed downstream.
val perf = WireInit(0.U.asTypeOf(new CutePerfToCoreIO))

// Connects `sources` to `slots` one-to-one and fails at elaboration time when the
// number of sources does not match the Vec size, instead of silently leaving slots at 0.
def connectPerfEvents(label: String, slots: Vec[UInt], sources: Seq[UInt]): Unit = {
  require(
    sources.size == slots.size,
    s"$label: ${sources.size} perf sources provided for ${slots.size} event slots"
  )
  slots.zip(sources).foreach { case (slot, src) => slot := src }
}

connectPerfEvents("backendEvents", perf.backendEvents, Seq(
  TaskCtrl.io.perfProbe.ownedWork,      // 0
  TaskCtrl.io.perfProbe.retire,         // 1
  TaskCtrl.io.perfProbe.compDone,       // 2
  TaskCtrl.io.perfProbe.releaseDone,    // 3
  TaskCtrl.io.perfProbe.mteActive,      // 4
  TaskCtrl.io.perfProbe.mmaNonfpDone,   // 5
  TaskCtrl.io.perfProbe.mmaFp16Done,    // 6
  TaskCtrl.io.perfProbe.mmaBf16Done,    // 7
  TaskCtrl.io.perfProbe.mmaTf32Done     // 8
))

connectPerfEvents("memEvents", perf.memEvents, Seq(
  TaskCtrl.io.perfProbe.loadADone,      // 0
  TaskCtrl.io.perfProbe.loadBDone,      // 1
  TaskCtrl.io.perfProbe.loadCDone,      // 2
  TaskCtrl.io.perfProbe.storeDone,      // 3
  TaskCtrl.io.perfProbe.amlActive,      // 4
  TaskCtrl.io.perfProbe.bmlActive,      // 5
  TaskCtrl.io.perfProbe.cmlLoadActive,  // 6
  TaskCtrl.io.perfProbe.cmlStoreActive, // 7
  MMU.io.perfProbe.rdReq,               // 8
  MMU.io.perfProbe.wrReq,               // 9
  MMU.io.perfProbe.rd32BReq,            // 10 (multi-bit count)
  MMU.io.perfProbe.wr32BReq             // 11 (multi-bit count)
))

// Register the whole bundle once before it leaves the module; the reset value is all zeros.
io.perf := RegNext(perf, 0.U.asTypeOf(new CutePerfToCoreIO))

// Assign default values to the inputs of every MatrixReg

Expand Down
9 changes: 9 additions & 0 deletions src/main/scala/LocalMMU.scala
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class LocalMMU()(implicit p: Parameters) extends CuteModule{
val CLoadLocalMMUIO = (new LocalMMUIO)
val CStoreLocalMMUIO = (new LocalMMUIO)
val LastLevelCacheTLIO = Flipped(new MMU2TLIO)
val perfProbe = Output(new LocalMMUPerfProbe)
})

val FirstRequestIndex = RegInit(0.U(LocalMMUTaskType.TaskTypeBitWidth.W))
Expand Down Expand Up @@ -205,4 +206,12 @@ class LocalMMU()(implicit p: Parameters) extends CuteModule{
val cStoreWr = io.CStoreLocalMMUIO.Request.fire & io.CStoreLocalMMUIO.Request.bits.RequestType_isWrite
XSPerfAccumulate("CUTE_MMU_C_rd_request", cLoadRd || cStoreRd)
XSPerfAccumulate("CUTE_MMU_C_wr_request", cLoadWr || cStoreWr)

// Perf probe: classify each request that fires on the LLC-facing port as read or write.
val outReqFire = io.LastLevelCacheTLIO.Request.fire
val outReqIsWr = io.LastLevelCacheTLIO.Request.bits.RequestType_isWrite
// Number of set mask bits divided by 32 (right shift by 5), rounded down.
// NOTE(review): presumably one mask bit per byte, i.e. a count of 32-byte units — confirm.
val outReqMask32B = PopCount(io.LastLevelCacheTLIO.Request.bits.RequestMask) >> 5

// Name the two qualified fire conditions once and reuse them for flag and count.
val perfOutRdFire = outReqFire && !outReqIsWr
val perfOutWrFire = outReqFire && outReqIsWr

io.perfProbe.rdReq := perfOutRdFire
io.perfProbe.wrReq := perfOutWrFire
// The counts are zero on cycles where no request of that direction fires.
io.perfProbe.rd32BReq := Mux(perfOutRdFire, outReqMask32B, 0.U)
io.perfProbe.wr32BReq := Mux(perfOutWrFire, outReqMask32B, 0.U)
}
23 changes: 23 additions & 0 deletions src/main/scala/TaskController.scala
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class TaskControllerIO(implicit p: Parameters) extends CuteBundle {
val CML_MicroTask_Config = new CMLMicroTaskConfigIO
val MTE_MicroTask_Config = new MTEMicroTaskConfigIO
val DebugTimeStampe = Input(UInt(32.W))
val perfProbe = Output(new TaskControllerPerfProbe)
}

abstract class BaseTaskController(implicit p: Parameters) extends CuteModule {
Expand Down Expand Up @@ -194,6 +195,7 @@ class TaskController(implicit p: Parameters) extends BaseTaskController {

// Default values for the MTE micro-task config outputs.
io.MTE_MicroTask_Config.MicroTaskValid := false.B
io.MTE_MicroTask_Config.computeType := MteComputeType.ComputeTypeUndef
// Default every perf probe flag to zero; the per-field assignments near the end of
// this class are connected later and therefore take precedence.
io.perfProbe := 0.U.asTypeOf(new TaskControllerPerfProbe)

// ===================== ChiselDB event definitions =====================
private val TileDimWidth = Bundles.Mtilex.width
Expand Down Expand Up @@ -574,6 +576,7 @@ class TaskController(implicit p: Parameters) extends BaseTaskController {
// High on the cycle the decoded FIFO's dequeue handshake fires.
val enqueueFire = decodedFifo.io.deq.fire

// Slot index used for the enqueue: `winHead` when `windowFull` is set, otherwise `winTail`.
val enqueueSlotIdx = Mux(windowFull, winHead, winTail)
// Perf: high while `deqValid` is set or `winCount` is non-zero; exported as `io.perfProbe.ownedWork`.
val ownedWork = deqValid || (winCount =/= 0.U)

// ===================== Issue dispatch bridge =====================
val issueCtrl = issueSlot.entry.ctrl
Expand Down Expand Up @@ -1411,4 +1414,24 @@ class TaskController(implicit p: Parameters) extends BaseTaskController {
storeEventTable.log(storeFinishEvent, storeFinishEventEn, "StoreFinish", clock, reset)

releaseEventTable.log(releaseIssueEvent, releaseIssueEventEn, "ReleaseIssue", clock, reset)

// ===================== Perf probe export =====================
// Decode the control word of the slot indexed by `fuCompute.ownerSlot` once and reuse
// the result for both the floating-point check and the compute-type classification.
val perfMmaDecoded = decodeMma(slots(fuCompute.ownerSlot).entry.ctrl)
val mmaDoneType = decodeMmaComputeType(perfMmaDecoded)
val releaseDone = issueFire && (issueSlot.opKind === TaskCtrlOpKind.Release)
// All MMA completion flags are qualified by the same compute-finish event.
val perfCompDone = computeWriteCFinishEventEn

// Work tracking and retirement.
io.perfProbe.ownedWork := ownedWork
io.perfProbe.retire := retireFire

// Completion events of the load / store / compute / release operations.
io.perfProbe.loadADone := loadAFinishEventEn
io.perfProbe.loadBDone := loadBFinishEventEn
io.perfProbe.loadCDone := loadCFinishEventEn
io.perfProbe.storeDone := storeFinishEventEn
io.perfProbe.compDone := perfCompDone
io.perfProbe.releaseDone := releaseDone

// Compute completions split by the decoded MMA type.
io.perfProbe.mmaNonfpDone := perfCompDone && !perfMmaDecoded.isfp
io.perfProbe.mmaFp16Done := perfCompDone && (mmaDoneType === MteComputeType.F16F16F32)
io.perfProbe.mmaBf16Done := perfCompDone && (mmaDoneType === MteComputeType.BF16BF16F32)
io.perfProbe.mmaTf32Done := perfCompDone && (mmaDoneType === MteComputeType.TF32TF32F32)

// Busy flags of the functional units.
io.perfProbe.amlActive := fuAML.busy
io.perfProbe.bmlActive := fuBML.busy
io.perfProbe.cmlLoadActive := fuCMLLoad.busy
io.perfProbe.mteActive := fuCompute.busy
io.perfProbe.cmlStoreActive := fuCMLStore.busy
}