/** * Copyright (c) 2024 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ /*! * \file matmul_client.h * \brief */ #ifndef __MATMUL_CLIENT_H__ #define __MATMUL_CLIENT_H__ #include "lib/matmul/tiling.h" #include "lib/matmul/matmul_call_back.h" #include "../../impl/matmul/matmul_utils.h" #include "kernel_operator.h" #if ASCENDC_CPU_DEBUG #include "lib/matmul/matmul_server.h" #endif namespace matmul { using namespace AscendC; #if ASCENDC_CPU_DEBUG template constexpr bool IsSharedMatmul() { return !MM_CFG.enableInit; } template > struct MatmulInstBase { __aicore__ inline MatmulInstBase(){}; }; template struct MatmulInstShared : MatmulInstBase { __aicore__ inline MatmulInstShared(){}; matmul::MatmulService mm[1]; }; template struct MatmulInst : MatmulInstBase { __aicore__ inline MatmulInst(){}; matmul::MatmulService mm[MIX_NUM]; }; template struct MatmulInstAux { __aicore__ inline MatmulInstAux(){}; }; template struct MatmulInstAux { __aicore__ inline MatmulInstAux(){}; using MATMUL = MatmulInstShared; }; template struct MatmulInstAux { __aicore__ inline MatmulInstAux(){}; using MATMUL = MatmulInst; }; #endif constexpr int32_t VECTOR_QUANT_MODE = 2; // Service function of the Matmul on the AIV client side, which is the unit for sending messages. template > class MatmulClient { using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; public: __aicore__ inline void Init(TCubeTiling* tiling, TPipe* tpipe = nullptr) { ASSERT(sizeof(KfcMsg) % CACHE_LINE_SIZE == 0); ASSERT(tiling != nullptr && "tiling cannot be nullptr when init matmul client"); ASSERT(sizeof(TCubeTiling) % sizeof(uint64_t) == 0); constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; int32_t ubAddr = -1; GM_ADDR tilingGM = client->AllocUB(tCubeTilingSize, ubAddr); auto tempTilingGM = reinterpret_cast<__gm__ uint32_t*>(tilingGM); auto tempTiling = reinterpret_cast(tiling); for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); ++i, ++tempTilingGM, ++tempTiling) { *tempTilingGM = *tempTiling; } this->tiling = tiling; GlobalTensor global; for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { Barrier(); global.SetGlobalBuffer((__gm__ int64_t*)(tilingGM + i)); DataCacheCleanAndInvalid(global); } Barrier(); auto msg = client->AllocMessage(); client->ubMsg->tilingInfo.tilingAddr = tilingGM; client->ubMsg->head = KfcMsgMakeFlag(KFC_Enum::MMFUN_INIT, this->instIdx); client->ubMsg->ubAddr = ubAddr; client->PostMessage(msg); // Initialize the local client after the expected processing is complete. 
*((uint64_t*)&kfcMsg_) = 0; *((uint64_t*)&(kfcMsg_.body)) = 0; nIter_ = ConstCeil(tiling->singleCoreN, tiling->baseN); mIter_ = ConstCeil(tiling->singleCoreM, tiling->baseM); mnIter_ = nIter_ * mIter_; cacheWorkspaceAddr = nullptr; } template __aicore__ inline void SetWorkspace(GlobalTensor addr) { ASSERT(addr.GetSize() > 0); SetWorkspace(addr.GetPhyAddr(), addr.GetSize() * sizeof(T)); } template __aicore__ inline void SetWorkspace(__gm__ const T* addr, int len) { ASSERT(addr != nullptr); ASSERT(this->tiling != nullptr); uint64_t offset = mnIter_ * tiling->baseN * tiling->baseM * sizeof(DstT); cacheWorkspaceAddr = reinterpret_cast(const_cast<__gm__ T*>(addr)); cOffset_ = 0; } __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK) { SetOrgShape(orgM, orgN, orgK, orgK, orgN); } __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0) { kfcMsg_.orgShape.orgM = orgM; kfcMsg_.orgShape.orgN = orgN; kfcMsg_.orgShape.orgKa = orgKa; kfcMsg_.orgShape.orgKb = orgKb; kfcMsg_.orgShape.orgKc = orgKc; PostMessage(); } __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK) { SetTail(singleM, singleN, singleK); } __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1) { if (tailM != -1) { mIter_ = ConstCeil(tailM, tiling->baseM); } if (tailN != -1) { nIter_ = ConstCeil(tailN, tiling->baseN); } mnIter_ = nIter_ * mIter_; kfcMsg_.body.singleM = tailM; kfcMsg_.body.singleN = tailN; kfcMsg_.body.singleK = tailK; kfcMsg_.body.setTail = 1; } // transMode only supports 0 or 1 // 0: round mode is round to nearest, ties to even // 1: round mode is round to nearest, ties away from zero __aicore__ inline void SetHF32(bool enHF32 = false, int32_t transMode = 0) { kfcMsg_.body.enHF32 = enHF32; kfcMsg_.body.hf32TransMode = transMode; PostMessage(); } __aicore__ inline void SetTensorA(const LocalTensor& a, bool isTranspose = false) { ASSERT(isTranspose <= A_TYPE::isTrans && "It is not allowed to do A transpose when matmul A transpose is not defined."); kfcMsg_.body.isTransA = static_cast(isTranspose); kfcMsg_.body.setTensorA = 1; kfcMsg_.body.isFirstIter = 1; if constexpr (A_TYPE::pos == TPosition::TSCM) { kfcMsg_.body.aAddr = GetTscmAddr(a); kfcMsg_.body.sizeAmatrix = a.GetSize() * sizeof(SrcT); } else { kfcMsg_.body.aAddr = GetGlobalAddr(a); kfcMsg_.body.sizeAmatrix = a.GetSize() * sizeof(SrcT); } } __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, bool isTranspose = false) { ASSERT(A_TYPE::pos != TPosition::TSCM); kfcMsg_.body.isTransA = static_cast(isTranspose); kfcMsg_.body.setTensorA = 1; kfcMsg_.body.isFirstIter = 1; kfcMsg_.body.aAddr = GetGMAddrAndCopyUB(gm.GetPhyAddr(), leftMatrix); kfcMsg_.body.sizeAmatrix = leftMatrix.GetSize() * sizeof(SrcT); } __aicore__ inline void SetTensorB(const LocalTensor& b, bool isTranspose = false) { ASSERT(isTranspose <= B_TYPE::isTrans && "It is not allowed to do B transpose when matmul B transpose is not defined."); kfcMsg_.body.isTransB = static_cast(isTranspose); kfcMsg_.body.setTensorB = 1; kfcMsg_.body.isFirstIter = 1; if constexpr (B_TYPE::pos == TPosition::TSCM) { kfcMsg_.body.bAddr = GetTscmAddr(b); kfcMsg_.body.sizeBmatrix = b.GetSize() * sizeof(SrcT); } else { kfcMsg_.body.bAddr = GetGlobalAddr(b); kfcMsg_.body.sizeBmatrix = b.GetSize() * sizeof(SrcT); } } __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& righMatrix, bool isTranspose = false) { ASSERT(B_TYPE::pos != TPosition::TSCM); 
kfcMsg_.body.isTransB = static_cast(isTranspose); kfcMsg_.body.setTensorB = 1; kfcMsg_.body.isFirstIter = 1; kfcMsg_.body.bAddr = GetGMAddrAndCopyUB(gm.GetPhyAddr(), righMatrix); kfcMsg_.body.sizeBmatrix = righMatrix.GetSize() * sizeof(SrcT); } __aicore__ inline void SetBias(const LocalTensor& bias) { kfcMsg_.body.setTensorBias = 1; if constexpr (BIAS_TYPE::pos == TPosition::TSCM) { kfcMsg_.body.biasAddr = GetTscmAddr(bias); } else { kfcMsg_.body.biasAddr = GetGlobalAddr(bias); } }; __aicore__ inline void SetTensorA(const GlobalTensor& a, bool isTranspose = false) { ASSERT(isTranspose <= A_TYPE::isTrans && "It is not allowed to do A transpose when matmul A transpose is not defined."); kfcMsg_.body.isTransA = static_cast(isTranspose); kfcMsg_.body.aAddr = reinterpret_cast(a.GetPhyAddr()); kfcMsg_.body.sizeAmatrix = a.GetSize() * sizeof(SrcT); kfcMsg_.body.setTensorA = 1; kfcMsg_.body.isFirstIter = 1; } __aicore__ inline void SetTensorB(const GlobalTensor& b, bool isTranspose = false) { ASSERT(isTranspose <= B_TYPE::isTrans && "It is not allowed to do B transpose when matmul B transpose is not defined."); kfcMsg_.body.isTransB = static_cast(isTranspose); kfcMsg_.body.bAddr = reinterpret_cast(b.GetPhyAddr()); kfcMsg_.body.sizeBmatrix = b.GetSize() * sizeof(SrcT); kfcMsg_.body.setTensorB = 1; kfcMsg_.body.isFirstIter = 1; } __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) { kfcMsg_.body.dataPtr = dataPtr; } __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) { kfcMsg_.userDefInfo.tilingPtr = tilingPtr; PostMessage(); } __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) { kfcMsg_.body.setQuant = 1; kfcMsg_.body.quantMode = 1; kfcMsg_.body.quantScalar = quantScalar; } __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) { kfcMsg_.body.setQuant = 1; kfcMsg_.body.quantMode = VECTOR_QUANT_MODE; kfcMsg_.body.quantAddr = reinterpret_cast(quantTensor.GetPhyAddr()); kfcMsg_.body.quantSize = quantTensor.GetSize() * sizeof(uint64_t); } __aicore__ inline void SetBias(const GlobalTensor& bias) { kfcMsg_.body.biasAddr = reinterpret_cast(bias.GetPhyAddr()); kfcMsg_.body.setTensorBias = 1; } __aicore__ inline void SetTensorA(SrcT aScalar) { auto temp1 = (uint8_t*)&(aScalar); auto temp2 = reinterpret_cast(&(kfcMsg_.body.aAddr)); for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { *temp2 = *temp1; } kfcMsg_.body.setTensorA = 1; } __aicore__ inline void SetTensorB(SrcT bScalar) { auto temp1 = (uint8_t*)&(bScalar); auto temp2 = reinterpret_cast(&(kfcMsg_.body.aAddr)); for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { *temp2 = *temp1; } kfcMsg_.body.setTensorB = 1; } __aicore__ inline void ClearBias() { kfcMsg_.body.setClearBias = 1; } __aicore__ inline void End() { if (isSyncGetC) { PostMessage(); } } template __aicore__ inline bool Iterate(bool enPartialSum = false) { TRACE_START(TraceId::KFC_CLIENT_POST_MSG); if (unlikely(kfcMsg_.body.isFirstIter)) { cntIter_ = 0; cOffset_ = 0; curProcess = 0; *((__gm__ uint64_t*)mmCntAddr_) = 0; GlobalTensor global; global.SetGlobalBuffer((__gm__ uint64_t*)mmCntAddr_); DataCacheCleanAndInvalid(global); } else { if (++cntIter_ >= mnIter_) { TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); return false; } if constexpr (!sync) { TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); return true; } } if constexpr (!sync) { // Asynchronous mode. Only UB. ASSERT(cacheWorkspaceAddr != 0); // The cache address must be configured in asynchronous mode. ASSERT(PhyPosIsUB(C_TYPE::pos)); // Asynchronous mode. Only UB. 
} isSyncGetC = sync; // Synchronous mode: no cache on the first iteration kfcMsg_.body.enPartialSum = enPartialSum; kfcMsg_.body.sync = sync; kfcMsg_.body.cAddr = reinterpret_cast(cacheWorkspaceAddr); PostMessage(); TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); return true; } // Only supported when IterateAll is asynchronous and the GM output is contiguous. // In non-contiguous scenarios, the system hangs. __aicore__ inline void WaitIterateAll() { ASSERT(!isSyncGetC); // Must be asynchronous mode WaitEvent(this->devEvtID); } // Only supported when IterateBatch is asynchronous and the GM output is contiguous. // In non-contiguous scenarios, the system hangs. __aicore__ inline void WaitIterateBatch() { ASSERT(!isSyncGetC); // Must be asynchronous mode WaitEvent(this->devEvtID); } template __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false) { TRACE_START(TraceId::KFC_CLIENT_POST_MSG); ASSERT(kfcMsg_.body.isFirstIter == 1); kfcMsg_.body.iterateFakeMsg = fakeMsg; kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); kfcMsg_.body.sync = sync; kfcMsg_.body.enSequentialWrite = enSequentialWrite; kfcMsg_.body.waitIterateAll = waitIterateAll; PostMessage(); if constexpr (sync) { WaitEvent(this->devEvtID); } isSyncGetC = sync; TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); } template __aicore__ inline void IterateAll(const LocalTensor& cMatrix, uint8_t enAtomic = 0) { TRACE_START(TraceId::KFC_CLIENT_POST_MSG); ASSERT(sync == true); ASSERT(enAtomic == 0); ASSERT(kfcMsg_.body.isFirstIter == 1); ASSERT((PhyPosIsL1(C_TYPE::pos)) && "IterateAll LocalTensor only support QuePosition A1 or B1"); if (cMatrix.GetPosition() == static_cast(TPosition::TSCM)) { kfcMsg_.body.cAddr = GetTscmAddr(cMatrix); kfcMsg_.body.cIsTscm = 1; } else { kfcMsg_.body.cAddr = GetGlobalAddr(cMatrix); } kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); kfcMsg_.body.sync = sync; ASSERT(kfcMsg_.body.enSequentialWrite == 0); GM_ADDR gmDataAddr = reinterpret_cast(kfcMsg_.body.cAddr); *((__gm__ uint64_t*)mmCntAddr_) = 0; GlobalTensor mmCntGlobal; mmCntGlobal.SetGlobalBuffer((__gm__ uint64_t*)mmCntAddr_); DataCacheCleanAndInvalid(mmCntGlobal); PostMessage(); if constexpr (sync) { WaitEvent(this->devEvtID); CopyToUB(cMatrix, gmDataAddr, cMatrix.GetSize()); } isSyncGetC = sync; TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); } template __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) { TRACE_START(TraceId::KFC_CLIENT_POST_MSG); ASSERT(kfcMsg_.body.isFirstIter == 1); kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); kfcMsg_.body.enSequentialWrite = enSequentialWrite; kfcMsg_.body.sync = sync; kfcMsg_.body.batchA = batchA; kfcMsg_.body.batchB = batchB; kfcMsg_.body.matrixStrideA = matrixStrideA; kfcMsg_.body.matrixStrideB = matrixStrideB; kfcMsg_.body.matrixStrideC = matrixStrideC; kfcMsg_.body.waitIterateBatch = waitIterateBatch; kfcMsg_.body.setBatch = 1; *((__gm__ uint64_t*)mmCntAddr_) = 0; GlobalTensor mmCntGlobal; mmCntGlobal.SetGlobalBuffer((__gm__ uint64_t*)mmCntAddr_); DataCacheCleanAndInvalid(mmCntGlobal); PostMessage(); if constexpr (sync) { WaitEvent(this->devEvtID); } isSyncGetC = sync; TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); } template __aicore__ inline void IterateBatch(const LocalTensor& 
ubCmatrix, uint32_t batchA, uint32_t batchB, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) { TRACE_START(TraceId::KFC_CLIENT_POST_MSG); ASSERT(sync == true); ASSERT(kfcMsg_.body.isFirstIter == 1); if (ubCmatrix.GetPosition() == static_cast(TPosition::TSCM)) { kfcMsg_.body.cAddr = GetTscmAddr(ubCmatrix); kfcMsg_.body.cIsTscm = 1; } else { kfcMsg_.body.cAddr = GetGlobalAddr(ubCmatrix); } kfcMsg_.body.enSequentialWrite = enSequentialWrite; kfcMsg_.body.sync = sync; kfcMsg_.body.batchA = batchA; kfcMsg_.body.batchB = batchB; kfcMsg_.body.matrixStrideA = matrixStrideA; kfcMsg_.body.matrixStrideB = matrixStrideB; kfcMsg_.body.matrixStrideC = matrixStrideC; kfcMsg_.body.setBatch = 1; GM_ADDR gmDataAddr = reinterpret_cast(kfcMsg_.body.cAddr); *((__gm__ uint64_t*)mmCntAddr_) = 0; GlobalTensor mmCntGlobal; mmCntGlobal.SetGlobalBuffer((__gm__ uint64_t*)mmCntAddr_); DataCacheCleanAndInvalid(mmCntGlobal); PostMessage(); if constexpr (sync) { WaitEvent(this->devEvtID); CopyToUB(ubCmatrix, gmDataAddr, ubCmatrix.GetSize()); } isSyncGetC = sync; TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); } template __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) { if constexpr (!MM_CFG.isNBatch) { return; } TRACE_START(TraceId::KFC_CLIENT_POST_MSG); cntIter_ = 0; cOffset_ = 0; curProcess = 0; ASSERT(kfcMsg_.body.isFirstIter == 1); ASSERT(cacheWorkspaceAddr); kfcMsg_.body.cAddr = reinterpret_cast(cacheWorkspaceAddr); kfcMsg_.body.enSequentialWrite = enSequentialWrite; kfcMsg_.body.sync = sync; kfcMsg_.body.batchLoop = batchLoop; kfcMsg_.body.batchA = batchA; kfcMsg_.body.batchB = batchB; kfcMsg_.body.matrixStrideA = matrixStrideA; kfcMsg_.body.matrixStrideB = matrixStrideB; kfcMsg_.body.matrixStrideC = matrixStrideC; kfcMsg_.body.setBatch = 1; kfcMsg_.body.waitIterateBatch = waitIterateBatch; *((__gm__ uint64_t*)mmCntAddr_) = 0; GlobalTensor mmCntGlobal; mmCntGlobal.SetGlobalBuffer((__gm__ uint64_t*)mmCntAddr_); DataCacheCleanAndInvalid(mmCntGlobal); PostMessage(); if constexpr (sync) { WaitEvent(this->devEvtID); } isSyncGetC = sync; TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); } // Synchronous interface. The user sends the GM address, which contains 64 bits. template __aicore__ inline void GetTensorC(const GlobalTensor& c, uint8_t enAtomic = 0, bool enSequentialWrite = false) { TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); ASSERT(kfcMsg_.body.isFirstIter == 0); ASSERT(isSyncGetC); // The mode must be synchronous. 
kfcMsg_.body.cAddr = reinterpret_cast(c.GetPhyAddr()); kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); kfcMsg_.body.enSequentialWrite = enSequentialWrite; kfcMsg_.body.sync = sync; PostMessage(); if constexpr (sync) { WaitEvent(this->devEvtID); } TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); } template __aicore__ inline void GetTensorC(const GlobalTensor &c, const LocalTensor &cLocal, uint8_t enAtomic = 0, bool enSequentialWrite = false) { TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); ASSERT(kfcMsg_.body.isFirstIter == 0); ASSERT(isSyncGetC); // Must be synchronous mode kfcMsg_.body.cAddr = reinterpret_cast(c.GetPhyAddr()); kfcMsg_.body.enAtomic = (uint8_t)enAtomic; kfcMsg_.body.enSequentialWrite = enSequentialWrite; kfcMsg_.body.sync = sync; PostMessage(); if constexpr (sync) { WaitEvent(this->devEvtID); } CopyToUB(cLocal, c.GetPhyAddr(), cLocal.GetSize()); TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); } // Synchronous interface template __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, uint32_t dstGap = 0) { TRACE_START(TraceId::KFC_CLIENT_REV_MSG_UB); ASSERT(kfcMsg_.body.isFirstIter == 0); if (!isSyncGetC) { // Asynchronous ASSERT(cacheWorkspaceAddr); ASSERT(enAtomic == 0); if (curProcess < INC_PROCESS_CHECK) { ++curProcess; WaitEvent(this->devEvtID); } uint32_t size; if constexpr (MM_CFG.baseMN != 0) { size = MM_CFG.baseMN * sizeof(typename C_TYPE::T); } else { size = tiling->baseM * tiling->baseN * sizeof(typename C_TYPE::T); } if constexpr (doPad) { CopyToUBPad(c, cacheWorkspaceAddr + cOffset_, height, width, srcGap, dstGap); } else { CopyToUB(c, cacheWorkspaceAddr + cOffset_, c.GetSize()); } cOffset_ += size; TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_UB); return; } ASSERT(sync == true); // Must match the sync mode used in Iterate. 
ASSERT(enAtomic == 0); kfcMsg_.body.cAddr = GetGlobalAddr(c); kfcMsg_.body.sync = 1; kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); kfcMsg_.body.enSequentialWrite = enSequentialWrite; GM_ADDR gmDataAddr = reinterpret_cast(kfcMsg_.body.cAddr); PostMessage(); WaitEvent(this->devEvtID); if constexpr (PhyPosIsUB(C_TYPE::pos)) { if constexpr (doPad) { CopyToUBPad(c, (__gm__ DstT*)gmDataAddr, height, width); } else { CopyToUB(c, (__gm__ DstT*)gmDataAddr, c.GetSize()); } } TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_UB); return; } template __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) { TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); ASSERT(kfcMsg_.body.isFirstIter == 0); ASSERT(!isSyncGetC); // Asynchronous only ASSERT(cacheWorkspaceAddr); if (curProcess < INC_PROCESS_CHECK) { ++curProcess; WaitEvent(this->devEvtID); } uint32_t size; GlobalTensor global; if constexpr (MM_CFG.baseMN != 0) { size = MM_CFG.baseMN * sizeof(typename C_TYPE::T); global.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(cacheWorkspaceAddr + cOffset_), MM_CFG.baseMN); } else { size = tiling->baseM * tiling->baseN * sizeof(typename C_TYPE::T); global.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(cacheWorkspaceAddr + cOffset_), tiling->baseM * tiling->baseN); } cOffset_ += size; TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); return global; } template __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) { GlobalTensor global; if constexpr (!MM_CFG.isNBatch) { return global; } TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); ASSERT(kfcMsg_.body.isFirstIter == 0); ASSERT(!isSyncGetC); // Asynchronous only ASSERT(cacheWorkspaceAddr); if (curProcess < INC_PROCESS_CHECK) { ++curProcess; WaitEvent(this->devEvtID); } uint32_t batch = batchA > batchB ? batchA : batchB; uint32_t size = batch * tiling->singleCoreM * tiling->singleCoreN * sizeof(typename C_TYPE::T); global.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(cacheWorkspaceAddr + cOffset_), batch * tiling->singleCoreM * tiling->singleCoreN); cOffset_ += size; TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); return global; } // Used together with IterateNBatch to fetch the result of a single IterateBatch call template __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) { if constexpr (!MM_CFG.isNBatch) { return; } TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); ASSERT(kfcMsg_.body.isFirstIter == 0); ASSERT(cacheWorkspaceAddr); ASSERT(enSequentialWrite); ASSERT(!isSyncGetC); // Asynchronous only if (curProcess < INC_PROCESS_CHECK) { ++curProcess; WaitEvent(this->devEvtID); } uint32_t batch = batchA > batchB ? 
batchA : batchB; uint32_t size = batch * tiling->singleCoreM * tiling->singleCoreN * sizeof(typename C_TYPE::T); CopyToUB(c, cacheWorkspaceAddr + cOffset_, c.GetSize()); cOffset_ += size; TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); } __aicore__ inline void AsyncGetTensorC(const LocalTensor& c) { TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); ASSERT(kfcMsg_.body.isFirstIter == 0); ASSERT(!isSyncGetC); ASSERT(cacheWorkspaceAddr); if (curProcess < INC_PROCESS_CHECK) { ++curProcess; WaitEvent(this->devEvtID); } uint32_t size = tiling->baseM * tiling->baseN * sizeof(typename C_TYPE::T); CopyToUB(c, cacheWorkspaceAddr + cOffset_, c.GetSize()); cOffset_ += size; TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); return; } __aicore__ inline void WaitGetTensorC() { event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); SetFlag(eventID); WaitFlag(eventID); } template __aicore__ inline MatrixOffset GetOffsetC() { if constexpr (isTurnOnDebug) { static_assert(!isTurnOnDebug, "unsupported!"); } } __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {}; #if ASCENDC_CPU_DEBUG public: // this is useless code just for cpu debug typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>::MATMUL mm; #endif private: __gm__ KfcMsg* mmCntAddr_; GM_ADDR cacheWorkspaceAddr; // Multiple instances with only one message queue maintained. // Use shared memory to get the queue. KfcCommClient* client; TPipe* tpipe; TCubeTiling* tiling; KfcMsg kfcMsg_; bool isSyncGetC; uint16_t devEvtID; uint16_t instIdx; uint16_t curProcess; uint32_t mIter_; uint32_t nIter_; uint32_t cntIter_; uint32_t mnIter_; uint64_t cOffset_; template friend __aicore__ inline void InitKfcClient(T& mm, U* tiling, TPipe* tpipe, KfcCommClient* client, int instIdx, GM_ADDR workspace); private: __aicore__ inline void InitStatic(const TCubeTiling* tiling) { ASSERT(sizeof(KfcMsg) % CACHE_LINE_SIZE == 0); ASSERT(tiling != nullptr && "tiling cannot be nullptr when init matmul client"); ASSERT(sizeof(TCubeTiling) % sizeof(uint64_t) == 0); this->tiling = const_cast(tiling); *((uint64_t*)&kfcMsg_) = 0; *((uint64_t*)&(kfcMsg_.body)) = 0; nIter_ = ConstCeil(tiling->singleCoreN, tiling->baseN); mIter_ = ConstCeil(tiling->singleCoreM, tiling->baseM); mnIter_ = nIter_ * mIter_; cacheWorkspaceAddr = nullptr; } template __aicore__ inline uint64_t CopyGlobalAddr(GM_ADDR& gmDataAddr, const LocalTensor& data) { event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); SetFlag(eventID); WaitFlag(eventID); struct DataCopyParams param; param.blockLen = data.GetSize() / AscendCUtils::GetC0Count(sizeof(T)); GlobalTensor globalTensor; globalTensor.SetGlobalBuffer((__gm__ T*)gmDataAddr); DataCopy(globalTensor, data, param); return reinterpret_cast(gmDataAddr); } template __aicore__ inline uint64_t GetGlobalAddr( const LocalTensor& data) { uint64_t size = Ceil(data.GetSize() * sizeof(T), ONE_BLK_SIZE) * ONE_BLK_SIZE; auto gmDataAddr = client->AllocUB(size, kfcMsg_.ubAddr); if constexpr (isCopy) { return CopyGlobalAddr(gmDataAddr, data); } return reinterpret_cast(gmDataAddr); } template __aicore__ inline uint64_t GetTscmAddr(const LocalTensor& data) { #if ASCENDC_CPU_DEBUG ASSERT(GetTPipePtr() != nullptr && "tpipe cannot be nullptr when matmul client post msg"); return GetAbsAddr(GetTPipePtr(), data); #else return (uint64_t)data.GetPhyAddr(); #endif } template __aicore__ inline void PostMessage() { kfcMsg_.head = KfcMsgMakeFlag(funID, this->instIdx); auto msg = client->AllocMessage(); ASSERT(msg != nullptr && 
"msg cannot be nullptr when matmul client post msg"); auto tmp1 = reinterpret_cast<__ubuf__ uint64_t*>(client->ubMsg); auto tmp2 = reinterpret_cast(&kfcMsg_); for (int i = 0; i < sizeof(kfcMsg_) / sizeof(uint64_t); i++, tmp1++, tmp2++) { *tmp1 = *tmp2; } client->PostMessage(msg); // clear flag *((uint32_t*)&kfcMsg_.body) = 0; // Clear all flag bits. kfcMsg_.ubAddr = -1; } // height width in unit of element template __aicore__ inline void CopyToUBPad(const LocalTensor& data, const __gm__ U* addr, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, uint32_t dstGap = 0) { ASSERT(C_TYPE::format == CubeFormat::ND_ALIGN && "Only support padding in ND_ALIGN mode, please check template param of GetTensorC."); DataCopyParams copyParams{ static_cast(height), static_cast(width * sizeof(T)), static_cast(srcGap), static_cast(dstGap) }; DataCopyPadParams padParams{ true, 0, static_cast( ConstCeil(width, AscendCUtils::GetC0Count(sizeof(T))) * AscendCUtils::GetC0Count(sizeof(T)) - width), 0 }; GlobalTensor globalTensor; globalTensor.SetGlobalBuffer((__gm__ T*)addr); DataCopyPad(data, globalTensor, copyParams, padParams); if constexpr (sync) { event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); SetFlag(eventID); WaitFlag(eventID); } } template __aicore__ inline void CopyToUB(const LocalTensor& data, const __gm__ U* addr, uint32_t size) { struct DataCopyParams repeatParams; repeatParams.blockLen = size / AscendCUtils::GetC0Count(sizeof(T)); GlobalTensor globalTensor; globalTensor.SetGlobalBuffer((__gm__ T*)addr); if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { int32_t batchNum = 1; int32_t offset = 0; if constexpr (C_TYPE::layout != LayoutMode::NONE) { int32_t alignedSingleCoreN = ConstCeil(tiling->singleCoreN, AscendCUtils::GetC0Count(sizeof(T))) * AscendCUtils::GetC0Count(sizeof(T)); offset = tiling->singleCoreM * alignedSingleCoreN; batchNum = size / offset; } for (int32_t idx = 0; idx < batchNum; ++idx) { DataCopyParams copyParams{ static_cast(tiling->singleCoreM), static_cast(tiling->singleCoreN * sizeof(T)), 0, 0 }; DataCopyPadParams padParams{ true, 0, static_cast(ConstCeil(tiling->singleCoreN, AscendCUtils::GetC0Count(sizeof(T))) * AscendCUtils::GetC0Count(sizeof(T)) - tiling->singleCoreN), 0 }; DataCopyPad(data[idx * offset], globalTensor[idx * offset], copyParams, padParams); } } else { DataCopy(data, globalTensor, repeatParams); } if constexpr (sync) { event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); SetFlag(eventID); WaitFlag(eventID); } } template __aicore__ inline uint64_t GetGMAddrAndCopyUB(const __gm__ T* gmDataAddr, const LocalTensor& data) { event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); SetFlag(eventID); WaitFlag(eventID); struct DataCopyParams param; param.blockLen = data.GetSize() / AscendCUtils::GetC0Count(sizeof(T)); GlobalTensor globalTensor; globalTensor.SetGlobalBuffer((__gm__ T*)gmDataAddr); DataCopy(globalTensor, data, param); return reinterpret_cast(gmDataAddr); } }; } // namespace matmul #endif