/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file matmul_server.h
 * \brief
 */
#ifndef __MATMUL_SERVER_H__
#define __MATMUL_SERVER_H__

#include "lib/matmul/matmul.h"
#include "kernel_operator.h"

namespace matmul {
using namespace AscendC;

template <bool IBSHARE> struct IBShareCache {
    __aicore__ inline IBShareCache() {};
};

template <> struct IBShareCache<false> {
    __aicore__ inline IBShareCache() {};
    using ShareCache = uint16_t;
};

template <> struct IBShareCache<true> {
    __aicore__ inline IBShareCache() {};
    using ShareCache = GlobalCache;
};

template <class A_TYPE, class B_TYPE> __aicore__ constexpr bool IsIBShare()
{
    if (A_TYPE::ibShare == true) {
        return true;
    }
    if (B_TYPE::ibShare == true) {
        return true;
    }
    return false;
}

struct MatmulMsg {
    uint32_t setOrgShape : 1;
    uint32_t orgM;
    uint32_t orgN;
    uint32_t orgKa;
    uint32_t orgKb;
    uint32_t orgKc;
};

struct ShareMatmulBase {
    __aicore__ inline ShareMatmulBase() {};
};

struct ShareMatmul : ShareMatmulBase {
    __aicore__ inline ShareMatmul() {};
    MatmulMsg msg0;
    MatmulMsg msg1;
};

template <bool ENABLE_INIT> struct ShareMatmulAux {
    __aicore__ inline ShareMatmulAux() {};
};

template <> struct ShareMatmulAux<true> {
    __aicore__ inline ShareMatmulAux() {};
    using MSG = ShareMatmulBase;
};

template <> struct ShareMatmulAux<false> {
    __aicore__ inline ShareMatmulAux() {};
    using MSG = ShareMatmul;
};

template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG = CFG_NORM,
    class MM_CB = MatmulCallBackFunc<nullptr, nullptr, nullptr>>
class MatmulService {
    using SrcT = typename A_TYPE::T;
    using DstT = typename C_TYPE::T;
    using BiasT = typename BIAS_TYPE::T;

public:
    __aicore__ inline MatmulService() {}

    __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace)
    {
        ASSERT(tpipe != nullptr && "tpipe cannot be nullptr when init kfc matmul server");
        ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server");
        ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server");
        ASSERT(instID >= 0 && "instID should not be less than 0 when init kfc matmul server");
        this->instID = instID;
        this->kfcCommSrv = kfc;
        this->tpipe = tpipe;
        this->workspace = workspace;
        mul.SetSubBlockIdx(kfcCommSrv->subBlockID);
        if constexpr (!MM_CFG.enableInit) {
            msgAux.msg0.setOrgShape = false;
            msgAux.msg1.setOrgShape = false;
        }
        this->devEvtID = instID;
        if constexpr (A_TYPE::ibShare == true || B_TYPE::ibShare == true) {
            if (kfcCommSrv->subBlockID == 0) {
                if (tiling) {
                    tiling_ = (TCubeTiling *)tiling;
                    gCache.template Init(tiling_, tpipe);
                }
            }
        }
        if (tiling) {
            tiling_ = (TCubeTiling *)tiling;
            mul.Init(tiling_, tpipe);
        }
    }

    __aicore__ inline void Init(__gm__ KfcMsg* msg)
    {
        if constexpr (!MM_CFG.enableInit) {
            return;
        } else {
            ASSERT(msg != nullptr && "msg cannot be nullptr when init matmul server");
            ASSERT(msg->tilingInfo.tilingAddr != nullptr && "tiling cannot be nullptr when init matmul server");
            auto temp1 = ((__gm__ uint32_t*)(msg->tilingInfo.tilingAddr));
            tiling_ = &tmpTiling_;
            auto temp2 = (uint32_t*)tiling_;
            constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
            GlobalTensor<int64_t> tilingGlobal;
            for (int i = 0; i <
                tCubeTilingSize; i += CACHE_LINE_SIZE) {
                Barrier();
                tilingGlobal.SetGlobalBuffer((__gm__ int64_t *)(msg->tilingInfo.tilingAddr + i));
                DataCacheCleanAndInvalid(tilingGlobal);
            }
            for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, temp1++, temp2++) {
                *temp2 = *temp1;
            }
            mul.Init(this->tiling_, this->tpipe);
        }
    }

    __aicore__ inline void SetSubBlockIdx(uint8_t idx)
    {
        mul.SetSubBlockIdx(idx);
    }

    __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg)
    {
        if constexpr (!MM_CFG.enableInit) {
            if (mul.GetSubBlockIdx() == 0) {
                msgAux.msg0.orgM = msg->orgShape.orgM;
                msgAux.msg0.orgN = msg->orgShape.orgN;
                msgAux.msg0.orgKa = msg->orgShape.orgKa;
                msgAux.msg0.orgKb = msg->orgShape.orgKb;
                msgAux.msg0.orgKc = msg->orgShape.orgKc;
                msgAux.msg0.setOrgShape = true;
            } else {
                msgAux.msg1.orgM = msg->orgShape.orgM;
                msgAux.msg1.orgN = msg->orgShape.orgN;
                msgAux.msg1.orgKa = msg->orgShape.orgKa;
                msgAux.msg1.orgKb = msg->orgShape.orgKb;
                msgAux.msg1.orgKc = msg->orgShape.orgKc;
                msgAux.msg1.setOrgShape = true;
            }
        } else {
            mul.SetOrgShape(msg->orgShape.orgM, msg->orgShape.orgN, msg->orgShape.orgKa, msg->orgShape.orgKb,
                msg->orgShape.orgKc);
        }
    }

    __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg)
    {
        if (msg->body.setTail) {
            mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK);
        }
    }

    __aicore__ inline void SetTail(__gm__ KfcMsg* msg)
    {
        if (msg->body.setTail) {
            mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK);
        }
    }

    __aicore__ inline void SetHF32(__gm__ KfcMsg* msg)
    {
        mul.SetHF32(static_cast<bool>(msg->body.enHF32), static_cast<int32_t>(msg->body.hf32TransMode));
    }

    __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg)
    {
        if (!msg->body.setTensorA) {
            return;
        }
        if constexpr (A_TYPE::format == CubeFormat::SCALAR) {
            SrcT scalar;
            auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr));
            auto temp2 = (uint8_t*)&scalar;
            for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) {
                *temp2 = *temp1;
            }
            mul.SetTensorA(scalar);
            return;
        }
        const uint64_t size = (uint64_t)(msg->body.sizeAmatrix);
        if constexpr (PhyPosIsL1(A_TYPE::pos)) {
            const auto& scmLocal = GetTscmTensor<SrcT>(msg->body.aAddr, size);
            mul.SetTensorA(scmLocal, msg->body.isTransA);
        } else {
            GlobalTensor<SrcT> aGlobal;
            aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr), size);
            mul.SetTensorA(aGlobal, msg->body.isTransA);
        }
    }

    __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset)
    {
        if (!msg->body.setTensorA) {
            return;
        }
        if constexpr (A_TYPE::format == CubeFormat::SCALAR) {
            SrcT scalar;
            auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset);
            auto temp2 = (uint8_t*)&scalar;
            for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) {
                *temp2 = *temp1;
            }
            mul.SetTensorA(scalar);
            return;
        }
        if constexpr (PhyPosIsL1(A_TYPE::pos)) {
            const auto& scmLocal = GetTscmTensor<SrcT>(msg->body.aAddr + offset, size);
            mul.SetTensorA(scmLocal, msg->body.isTransA);
        } else {
            GlobalTensor<SrcT> aGlobal;
            aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr + offset), size);
            mul.SetTensorA(aGlobal, msg->body.isTransA);
        }
    }

    __aicore__ inline void SetQuantVector(__gm__ KfcMsg* msg)
    {
        if (!msg->body.setQuant) {
            return;
        }
        int quantMode = msg->body.quantMode;
        if (quantMode == 1) {
            uint64_t quantScalar = msg->body.quantScalar;
            mul.SetQuantScalar(quantScalar);
        } else if (quantMode == 2) {
            const uint64_t size = static_cast<uint64_t>(msg->body.quantSize);
            GlobalTensor<uint64_t> quantGlobal;
            quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size);
            mul.SetQuantVector(quantGlobal);
        }
    }

    __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg)
    {
        if constexpr (A_TYPE::layout == LayoutMode::NONE) {
            return;
        }
        if (!msg->body.setBatch) {
            return;
        }
        mul.SetBatchNum(msg->body.batchA, msg->body.batchB);
    }

    __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg)
    {
        GlobalTensor<int64_t> msgGlobal;
        msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t));
        DataCacheCleanAndInvalid(msgGlobal);
        mul.SetSelfDefineData(msg->body.dataPtr);
    }

    __aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg)
    {
        mul.SetUserDefInfo(msg->userDefInfo.tilingPtr);
    }

    __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg)
    {
        if (!msg->body.setTensorB) {
            return;
        }
        if constexpr (B_TYPE::format == CubeFormat::SCALAR) {
            SrcT scalar;
            auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr));
            auto temp2 = (uint8_t*)&scalar;
            for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) {
                *temp2 = *temp1;
            }
            mul.SetTensorB(scalar);
            return;
        }
        const uint64_t size = (uint64_t)(msg->body.sizeBmatrix);
        if constexpr (PhyPosIsL1(B_TYPE::pos)) {
            const auto& scmLocal = GetTscmTensor<SrcT>(msg->body.bAddr, size);
            mul.SetTensorB(scmLocal, msg->body.isTransB);
        } else {
            GlobalTensor<SrcT> bGlobal;
            bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr), size);
            mul.SetTensorB(bGlobal, msg->body.isTransB);
        }
    }

    __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset)
    {
        if (!msg->body.setTensorB) {
            return;
        }
        if constexpr (B_TYPE::format == CubeFormat::SCALAR) {
            SrcT scalar;
            auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset);
            auto temp2 = (uint8_t*)&scalar;
            for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) {
                *temp2 = *temp1;
            }
            mul.SetTensorB(scalar);
            return;
        }
        if constexpr (PhyPosIsL1(B_TYPE::pos)) {
            const auto& scmLocal = GetTscmTensor<SrcT>(msg->body.bAddr + offset, size);
            mul.SetTensorB(scmLocal, msg->body.isTransB);
        } else {
            GlobalTensor<SrcT> bGlobal;
            bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr + offset), size);
            mul.SetTensorB(bGlobal, msg->body.isTransB);
        }
    }

    __aicore__ inline void SetBias(__gm__ KfcMsg* msg)
    {
        if (msg->body.setTensorBias) {
            const uint64_t size = (uint64_t)tiling_->singleCoreN;
            if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) {
                const auto& scmLocal = GetTscmTensor<BiasT>(msg->body.biasAddr, size);
                mul.SetBias(scmLocal);
            } else {
                GlobalTensor<BiasT> biasGlobal;
                biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr), size);
                mul.SetBias(biasGlobal);
            }
        } else if (msg->body.setClearBias) {
            mul.ClearBias();
        }
    }

    __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset)
    {
        if (msg->body.setTensorBias) {
            const uint64_t size = (uint64_t)tiling_->singleCoreN;
            if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) {
                const auto& scmLocal = GetTscmTensor<BiasT>(msg->body.biasAddr + offset, size);
                mul.SetBias(scmLocal);
            } else {
                GlobalTensor<BiasT> biasGlobal;
                biasGlobal.SetGlobalBuffer(
                    reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size);
                mul.SetBias(biasGlobal);
            }
        } else if (msg->body.setClearBias) {
            mul.ClearBias();
        }
    }

    __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg)
    {
        if constexpr (A_TYPE::layout != LayoutMode::NONE) {
            return true;
        }
        uint64_t size;
        if constexpr (MM_CFG.baseMN != 0) {
            size = MM_CFG.baseMN;
        } else {
            size = tiling_->baseM * tiling_->baseN;
        }
        if constexpr (PhyPosIsL1(C_TYPE::pos)) {
            const auto& scmLocal = GetTscmTensor<DstT>(msg->body.cAddr, size);
            mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite);
        } else {
            GlobalTensor<DstT> cGlobal;
            cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size);
            mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite);
        }
        // Now release UB
        if constexpr (PhyPosIsUB(C_TYPE::pos)) {
            if (unlikely(msg->ubAddr >= 0)) {
                kfcCommSrv->FreeUB(msg->ubAddr);
            }
        }
        if (msg->body.sync == 1) { // Synchronize
            uint16_t eventID = static_cast<uint16_t>(this->devEvtID * 2 + mul.GetSubBlockIdx());
            NotifyEvent(eventID);
        }
        return false;
    }

    __aicore__ inline uint16_t GetInstID()
    {
        return instID;
    }

    __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg)
    {
        if constexpr (!MM_CFG.enableInit) {
            if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) {
                mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, msgAux.msg0.orgKb,
                    msgAux.msg0.orgKc);
            } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) {
                mul.SetOrgShape(msgAux.msg1.orgM, msgAux.msg1.orgN, msgAux.msg1.orgKa, msgAux.msg1.orgKb,
                    msgAux.msg1.orgKc);
            }
        }
        if (msg->body.isFirstIter) {
            SetTensorA(msg);
            SetTensorB(msg);
            if constexpr (MM_CFG.enableSetBias) {
                SetBias(msg);
            }
            if constexpr (MM_CFG.enableSetTail) {
                SetTail(msg);
            }
            if constexpr (MM_CFG.enableQuantVector) {
                SetQuantVector(msg);
            }
            if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) ||
                ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) {
                SetBatchNum(msg);
            }
            if constexpr (MM_CFG.enableSetDefineData) {
                SetSelfDefineData(msg);
            }
        }
    }

    __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize,
        const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0)
    {
        if (msg->body.isFirstIter) {
            SetTensorA(msg, batchASize, offsetA);
            SetTensorB(msg, batchBSize, offsetB);
            SetBias(msg, offsetBias);
            SetTail(msg);
            SetQuantVector(msg);
            SetBatchNum(msg);
        }
    }

    __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg)
    {
        if constexpr (A_TYPE::layout == LayoutMode::NONE) {
            return true;
        }
        // In the batch scenario, messages occupy 128 bytes. After the update, messages occupy 64 bytes.
        GlobalTensor<int64_t> msgGlobal;
        msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t));
        DataCacheCleanAndInvalid(msgGlobal);
        ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM
        IterateSetMessage(msg);
        uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN;
        GlobalTensor<DstT> cGlobal;
        cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size);
        mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite,
            msg->body.matrixStrideA, msg->body.matrixStrideB, msg->body.matrixStrideC);
        // Now release UB
        if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || PhyPosIsUB(BIAS_TYPE::pos) ||
            PhyPosIsUB(C_TYPE::pos)) {
            if (unlikely(msg->ubAddr >= 0)) {
                kfcCommSrv->FreeUB(msg->ubAddr);
            }
        }
        if (msg->body.sync || msg->body.waitIterateBatch) {
            uint16_t eventID = static_cast<uint16_t>(this->devEvtID * 2 + mul.GetSubBlockIdx());
            NotifyEvent(eventID);
        }
        return true;
    }

    __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg)
    {
        if constexpr (!MM_CFG.isNBatch) {
            return true;
        }
        GlobalTensor<int64_t> msgGlobal;
        msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t));
        DataCacheCleanAndInvalid(msgGlobal);
        ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM
        const uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN;
        const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop;
        uint64_t batchAOffset = tiling_->ALayoutInfoD * msg->body.batchA;
        if constexpr (A_TYPE::layout != LayoutMode::SBNGD) {
            batchAOffset = batchAOffset * tiling_->ALayoutInfoS;
        }
        const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop;
        uint64_t batchBOffset = tiling_->BLayoutInfoD * msg->body.batchB;
        if constexpr (B_TYPE::layout != LayoutMode::SBNGD) {
            batchBOffset = batchBOffset * tiling_->BLayoutInfoS;
        }
        const uint64_t batchCOffset = tiling_->CLayoutInfoS2;
        const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB;
        bool layoutGCondition = tiling_->CLayoutInfoG == 1 &&
            (tiling_->BLayoutInfoG != 1 || tiling_->ALayoutInfoG != 1);
        int32_t layoutG = tiling_->BLayoutInfoG > tiling_->ALayoutInfoG ?
            tiling_->BLayoutInfoG : tiling_->ALayoutInfoG;
        int32_t batchOffsetBias = tiling_->CLayoutInfoS2 * batchC;
        if (layoutGCondition) {
            batchOffsetBias = batchOffsetBias / layoutG;
        }
        int32_t batchOffsetC = batchOffsetBias * sizeof(typename C_TYPE::T);
        if constexpr (C_TYPE::layout != LayoutMode::SBNGD) {
            batchOffsetC = batchOffsetC * tiling_->CLayoutInfoS1;
        }
        uint64_t offset = 0;
        uint32_t cntIterator = 0;
        for (uint32_t loopIdx = 0U; loopIdx < msg->body.batchLoop; loopIdx++) {
            const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T);
            const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T);
            const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T);
            IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset);
            GlobalTensor<DstT> cGlobal;
            cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size);
            mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic),
                msg->body.enSequentialWrite, msg->body.matrixStrideA, msg->body.matrixStrideB,
                msg->body.matrixStrideC);
            cntIterator++;
            if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) {
                uint16_t eventID = static_cast<uint16_t>(this->devEvtID * 2 + mul.GetSubBlockIdx());
                NotifyEvent(eventID);
            }
            offset += batchOffsetC;
        }
        // Now release UB
        if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || PhyPosIsUB(BIAS_TYPE::pos) ||
            PhyPosIsUB(C_TYPE::pos)) {
            if (unlikely(msg->ubAddr >= 0)) {
                kfcCommSrv->FreeUB(msg->ubAddr);
            }
        }
        uint16_t eventID = static_cast<uint16_t>(this->devEvtID * 2 + mul.GetSubBlockIdx());
        if (msg->body.sync || msg->body.waitIterateBatch) {
            NotifyEvent(eventID);
        } else if (cntIterator >= INC_PROCESS_CHECK) {
            NotifyEvent(eventID);
        }
        return true;
    }

    __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID)
    {
        if constexpr (A_TYPE::layout != LayoutMode::NONE) {
            return true;
        }
        if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) {
            if (msg->body.iterateFakeMsg) {
                if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { // fake msg
                    uint16_t eventID = static_cast<uint16_t>(this->devEvtID * 2 + kfcCommSrv->subBlockID);
                    NotifyEvent(eventID);
                    return true;
                }
            }
        } else {
            ASSERT(!msg->body.iterateFakeMsg && "Only IBShare mode supports fake msg.");
        }
        if constexpr ((IsSameType::value && IsSameType::value) ||
            ((IsSameType::value || IsSameType::value) && IsSameType::value) ||
            (IsSameType::value && (IsSameType::value || IsSameType::value))) {
            GlobalTensor<int64_t> msgGlobal;
            msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t));
            DataCacheCleanAndInvalid(msgGlobal);
        }
        IterateSetMessage(msg);
        uint64_t size;
        if constexpr (MM_CFG.singleCoreMN != 0) {
            size = MM_CFG.singleCoreMN;
        } else {
            size = tiling_->singleCoreM * tiling_->singleCoreN;
        }
        GlobalTensor<DstT> cGlobal;
        cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size);
        const auto& scmLocal = GetTscmTensor<DstT>(msg->body.cAddr, size);
        uint64_t offset = 0;
        uint64_t offsetSize = 0;
        auto enSequentialWrite = msg->body.enSequentialWrite;
        auto enAtomic = msg->body.enAtomic;
        auto sync = msg->body.sync;
        auto enPartialSum = msg->body.enPartialSum;
        if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) {
            ASSERT(msg->body.cAddr != 0); // The output address must be configured.
            if constexpr (MM_CFG.baseMN != 0) {
                offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0;
            } else {
                offsetSize = enSequentialWrite ?
                    (tiling_->baseM * tiling_->baseN) : 0;
            }
        } else {
            if (funID == KFC_Enum::MMFUN_ITERATE_ALL) {
                ASSERT(msg->body.cAddr != 0); // The output address must be configured.
                if constexpr (MM_CFG.baseMN != 0) {
                    offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0;
                } else {
                    offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0;
                }
            } else if (sync == 0) {
                // For asynchronous Iterate, the offset must be used for address calculation and
                // the size is baseM x baseN.
                if constexpr (MM_CFG.baseMN != 0) {
                    offsetSize = MM_CFG.baseMN;
                } else {
                    offsetSize = tiling_->baseM * tiling_->baseN;
                }
                enSequentialWrite = 1;
            }
        }
        uint32_t cntIterator = 0;
        TRACE_START(TraceId::MatMul_CALC);
        // Asynchronous and configure the workspace
        while (mul.Iterate(enPartialSum)) {
            if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) {
                if (unlikely(cntIterator == 0)) {
                    if (unlikely(funID == KFC_Enum::MMFUN_ITERATE && sync == 1)) {
                        TRACE_STOP(TraceId::MatMul_CALC);
                        return false; // The queue is not switched, and no message needs to be returned.
                    }
                }
            }
            if constexpr (PhyPosIsL1(C_TYPE::pos)) {
                mul.GetTensorC(scmLocal[offset], (uint8_t)(enAtomic), enSequentialWrite);
            } else {
                mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite);
            }
            cntIterator++;
            if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) {
                if (cntIterator < INC_PROCESS_CHECK) {
                    if (funID == KFC_Enum::MMFUN_ITERATE) {
                        uint16_t eventID = static_cast<uint16_t>(this->devEvtID * 2 + mul.GetSubBlockIdx());
                        NotifyEvent(eventID);
                    }
                }
            }
            offset += offsetSize;
        }
        // Now release UB
        if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || PhyPosIsUB(BIAS_TYPE::pos) ||
            PhyPosIsUB(C_TYPE::pos)) {
            if (unlikely(msg->ubAddr >= 0)) {
                kfcCommSrv->FreeUB(msg->ubAddr);
            }
        }
        uint16_t eventID = static_cast<uint16_t>(this->devEvtID * 2 + mul.GetSubBlockIdx());
        if (sync || msg->body.waitIterateAll) {
            ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL);
            NotifyEvent(eventID);
        } else if (cntIterator >= INC_PROCESS_CHECK && funID == KFC_Enum::MMFUN_ITERATE) {
            NotifyEvent(eventID);
        }
        mul.End();
        TRACE_STOP(TraceId::MatMul_CALC);
        return true;
    }

    __aicore__ inline bool IsSharedMatmul()
    {
        if constexpr (MM_CFG.enableInit) {
            return false;
        } else {
            return true;
        }
    }

    __aicore__ inline bool ProcessIbShareSync(KFC_Enum funID, bool& freeMsg, int& lastMsgId, const int subBlockID)
    {
        if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) {
            if (funID == KFC_Enum::MMFUN_ITERATE_ALL) {
                if (lastMsgId == subBlockID) {
                    freeMsg = false;
                    return true;
                }
                lastMsgId = subBlockID;
                return false;
            }
            return false;
        } else {
            return false;
        }
    }

    __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID)
    {
        if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) ||
            ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) {
            if ((static_cast<uint16_t>(funID) & static_cast<uint16_t>(KFC_Enum::MMFUN_MASK)) ==
                static_cast<uint16_t>(KFC_Enum::MMFUN_MASK)) {
                return Iterate(msg, funID);
            }
        }
        if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0)) {
            if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) {
                return IterateBatch(msg);
            }
        }
        if constexpr (MM_CFG.enableEnd) {
            if (funID == KFC_Enum::MMFUN_END) {
                mul.End();
            }
        }
        if constexpr (MM_CFG.enableGetTensorC) {
            if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) {
                return GetTensorC(msg);
            }
        }
        if constexpr (MM_CFG.enableSetOrgShape) {
            if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) {
                SetOrgShape(msg);
                return true;
            }
        }
        if constexpr (MM_CFG.enableInit) {
            if (funID == KFC_Enum::MMFUN_INIT) {
                Init(msg);
                return true;
            }
        }
        if constexpr
            (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) {
            if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) {
                return IterateNBatch(msg);
            }
        }
        if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) {
            SetUserDefInfo(msg);
            return true;
        }
        if (funID == KFC_Enum::MMFUN_SET_HF32) {
            SetHF32(msg);
            return true;
        }
        ASSERT("illegal function ID.");
        return true;
    }

    template <class DstT>
    __aicore__ LocalTensor<DstT> GetTscmTensor(uint64_t addr, const uint64_t size)
    {
        LocalTensor<DstT> scmLocal;
        TBuffAddr scmTbuf;
        scmTbuf.logicPos = (uint8_t)(TPosition::TSCM);
        scmTbuf.dataLen = size * sizeof(DstT);
        scmTbuf.bufferAddr = addr;
#if ASCENDC_CPU_DEBUG
        scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr;
#endif
        scmLocal.SetAddr(scmTbuf);
        return scmLocal;
    }

private:
    MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB> mul;
    GM_ADDR workspace;
    KfcCommServer* kfcCommSrv;
    TPipe* tpipe;
    TCubeTiling* tiling_;
    TCubeTiling tmpTiling_; // for compatibility with the Init interface
    typename IBShareCache<IsIBShare<A_TYPE, B_TYPE>()>::ShareCache gCache;
    typename ShareMatmulAux<MM_CFG.enableInit>::MSG msgAux;
    uint16_t instID;
    uint16_t devEvtID;
};
} // namespace matmul
#endif // __MATMUL_SERVER_H__