/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file matmul.h
 * \brief
 */
#ifndef __MATMUL_MATMUL_H__
#define __MATMUL_MATMUL_H__

#include
#include "lib/matmul/tiling.h"
#include "../../impl/matmul/matmul_macro_v220_impl.h"
#include "../../impl/matmul/matmul_macro_v220_basic_impl.h"
#include "../../impl/matmul/matmul_macro_v200_impl.h"
#include "../../impl/matmul/matmul_utils.h"
#include "lib/matmul/matmul_call_back.h"

namespace matmul {
class GlobalCache;
}

__BLOCK_LOCAL__ __inline__ matmul::GlobalCache* gL1Cache;

__aicore__ inline matmul::GlobalCache* GetGlobalCachePtr()
{
    return gL1Cache;
}

namespace matmul {
using namespace AscendC;

template <typename SrcT>
__aicore__ inline constexpr int32_t GetC0Size()
{
    if (sizeof(SrcT) == sizeof(float)) {
        return 8;
    } else if (sizeof(SrcT) == sizeof(int8_t)) {
        return 32;
    }
    return 16;
}

class GlobalCache {
public:
    __aicore__ inline GlobalCache() = default;
    template __aicore__ inline void Init(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe);
    template __aicore__ inline void InitBuffer(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe);
    template <typename SrcT> __aicore__ inline bool Hit(__gm__ SrcT* gmAddr);
    template __aicore__ inline void EnQue(const LocalTensor& tensor);
    template __aicore__ inline LocalTensor DeQue();
    template __aicore__ inline LocalTensor AllocTensor();
    template __aicore__ inline void FreeTensor(LocalTensor& tensor);
    template __aicore__ inline void ClearCache();
    template __aicore__ inline LocalTensor GetCacheHead();
    template __aicore__ inline void SetCacheHead(LocalTensor& cacheHead);
    template <typename SrcT> __aicore__ inline void SetOrgAddr(__gm__ SrcT* gmAddr);
    __aicore__ inline GM_ADDR GetOrgAddr();
    __aicore__ inline void FreeAllEvent();
    __aicore__ inline int32_t GetCacheSize();
    __aicore__ inline void ReduceCacheSize();

    TQue cacheQue_;
    TBuffAddr cacheHead_;
    GM_ADDR srcAddr_;
    int32_t cacheSize_;
    bool alloc_;
};

struct CopyGMParams {
    int dstOffset { 0 };
    int baseUseN { 0 };
    int blockCount { 0 };
    int dstStride { 0 };
    bool isComputeLineByLine { false };
};

struct DataCopyOutParams {
    __aicore__ DataCopyOutParams()
    {
        quantMode = 0;
        cBurstNum = 0;
        burstLen = 0;
        srcStride = 0;
        dstStride = 0;
        oriNSize = 0;
        enUnitFlag = false;
        quantScalar = 0;
    }

    __aicore__ DataCopyOutParams(const uint16_t count, const uint16_t len, const uint16_t srcStrideIn,
        const uint32_t dstStrideIn, const uint16_t nSize, const bool unitFlag)
    {
        cBurstNum = count;
        burstLen = len;
        srcStride = srcStrideIn;
        dstStride = dstStrideIn;
        oriNSize = nSize;
        enUnitFlag = unitFlag;
    }

    uint8_t quantMode = 0;
    uint16_t cBurstNum = 0;
    uint16_t burstLen = 0;
    uint16_t srcStride = 0;
    uint32_t dstStride = 0;
    uint16_t oriNSize = 0;
    bool enUnitFlag = false;
    uint64_t quantScalar = 0;
    uint64_t cbufWorkspaceAddr = 0;
};

constexpr int32_t QUEUE_DEPTH = 1;
constexpr int32_t NZ_MASK_VAlUE = 2;
constexpr int32_t FLOAT_FACTOR = 2;
constexpr int32_t B8_C0SIZE = 32;
constexpr int32_t B32_C0SIZE = 8;
constexpr int32_t B16_C0SIZE = 16;
constexpr int32_t CTRL_46_BIT = 46;
constexpr int32_t CTRL_47_BIT = 47;

#if __CCE_AICORE__ < 200
constexpr int32_t DB_FACTOR = 1;
#else
constexpr int32_t DB_FACTOR = 2;
#endif

// the KFC_MESSAGE_LENGTH is 64
// the MAX_MSG_COUNT is 64
// the BIDIRECTION_NUM is 2
// the MAX_MATMUL_OBJ is 8
// the MAX_AIV_NUM is 50
// the TOTAL_UB_SIZE is 192 * 1024; for ascend910b1
// fixpipe vdeqf16 quant tensor Gm offset
// the gm_offset is AllMsgLen + AllCntMsgLen + AllUBMap
// equal: sizeof(KfcMsg) * 2 * MAX_MSG_COUNT * MAX_AIV_NUM +
// equal: sizeof(KfcMsg) * MAX_MATMUL_OBJ * MAX_AIV_NUM +
// equal: TOTAL_UB_SIZE * MAX_AIV_NUM
constexpr int64_t GM_OFFSET = 128 * 2 * 64 * 50 + 128 * 8 * 50 + 192 * 1024 * 50;

template <typename T> struct GetDstType { using Type = T; };
template <> struct GetDstType { using Type = float; };
template <> struct GetDstType { using Type = float; };
template <> struct GetDstType { using Type = int32_t; };
#if __CCE_AICORE__ >= 220
template <> struct GetDstType { using Type = float; };
template <> struct GetDstType { using Type = int32_t; };
#endif

int32_t constexpr GetNdNzMask(CubeFormat dstFormat, CubeFormat srcFormat)
{
    if ((srcFormat == CubeFormat::ND) && (dstFormat == CubeFormat::NZ)) {
        return 1;
    } else if ((srcFormat == CubeFormat::NZ) && (dstFormat == CubeFormat::ND)) {
        return NZ_MASK_VAlUE;
    }
    return 0;
}

template <typename SrcT> constexpr static int32_t AuxGetFactor()
{
    if (sizeof(SrcT) == sizeof(float)) {
        return FLOAT_FACTOR;
    }
    return 1;
}

template <typename SrcT> constexpr static int32_t AuxGetC0Size()
{
    if (sizeof(SrcT) == sizeof(float)) {
        return B32_C0SIZE;
    } else if (IsSameType::value || IsSameType::value) {
        return B8_C0SIZE;
    }
    return B16_C0SIZE;
}

template <TPosition POSITION, CubeFormat FORMAT, typename TYPE, bool ISTRANS = false,
    LayoutMode LAYOUT = LayoutMode::NONE, bool IBSHARE = false>
struct MatmulType {
    constexpr static TPosition pos = POSITION;
    constexpr static CubeFormat format = FORMAT;
    using T = TYPE;
    constexpr static bool isTrans = ISTRANS;
    constexpr static LayoutMode layout = LAYOUT;
    constexpr static bool ibShare = IBSHARE;
};

__aicore__ constexpr bool DoMatmulNorm(MatmulConfig mmCFG)
{
    return mmCFG.doNorm;
}

__aicore__ constexpr bool EnUnitFlag(MatmulConfig mmCFG)
{
    return mmCFG.enUnitFlag;
}

__aicore__ constexpr bool DoMatmulBasicBlock(MatmulConfig mmCFG)
{
    return mmCFG.doBasicBlock;
}

__aicore__ constexpr bool DoMatmulSpecialBasicBlock(MatmulConfig mmCFG)
{
    return mmCFG.doSpecialBasicBlock;
}

__aicore__ constexpr bool DoMatmulMDL(MatmulConfig mmCFG)
{
    return mmCFG.doMultiDataLoad;
}

__aicore__ constexpr bool DoMatmulIBShareNorm(MatmulConfig mmCFG)
{
    return mmCFG.doIBShareNorm;
}

__aicore__ constexpr bool DoMatmulSpecialMDL(MatmulConfig mmCFG)
{
    return mmCFG.doSpecialMDL;
}

__aicore__ constexpr MatmulVersion GetMatmulVersion(MatmulConfig mmCFG)
{
    if (DoMatmulNorm(mmCFG)) {
        return MatmulVersion::NORMAL;
    } else if (DoMatmulBasicBlock(mmCFG) || DoMatmulSpecialBasicBlock(mmCFG)) {
        return MatmulVersion::BASIC_BLOCK;
    } else if (DoMatmulMDL(mmCFG) || DoMatmulSpecialMDL(mmCFG)) {
        return MatmulVersion::MULTI_DATA_LOAD;
    } else if (DoMatmulIBShareNorm(mmCFG)) {
        return MatmulVersion::IBSHARE_NORM;
    }
    return MatmulVersion::NORMAL;
}

__aicore__ inline int Ceil(int num1, int num2)
{
    ASCENDC_ASSERT((num2 > 0), { KERNEL_LOG(KERNEL_ERROR, "num2 is %d, which should be larger than 0", num2); });
    return (num1 + num2 - 1) / num2;
}

template struct QidType {
    __aicore__ inline QidType() {};
};

template struct QidType {
    __aicore__ inline QidType() {};
    using QUE = int32_t;
};

template struct QidType {
    __aicore__ inline QidType() {};
    using QUE = int32_t;
};

template struct QidType {
    __aicore__ inline QidType() {};
    using QUE = TQueBind;
};

template struct QidType {
    __aicore__ inline QidType() {};
    using QUE =
TQueBind; }; /* ************************************************************************************************** * L0cType * * ************************************************************************************************* */ template struct L0cType { __aicore__ inline L0cType() {}; }; template struct L0cType { __aicore__ inline L0cType() {}; #if __CCE_AICORE__ >= 220 using BUFFER = TBuf; #else using BUFFER = TQue; #endif }; template struct L0cType { __aicore__ inline L0cType() {}; using BUFFER = TQue; }; /* ************************************************************************************************** * MatmulParamsBase * * ************************************************************************************************* */ template struct MatmulParamsBase { __aicore__ inline MatmulParamsBase() {}; }; template struct MatmulParamsNorm : public MatmulParamsBase { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulParamsNorm() {}; using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; using DstT = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; TQue qidBias_; typename L0cType::BUFFER CO1_; #if __CCE_AICORE__ < 220 TQue qidA2_; TQue qidB2_; TQue qidVecIn_; TQue qidCO2_; typename QidType::QUE qidA1_; typename QidType::QUE qidB1_; typename QidType::QUE qidA1Cache_; typename QidType::QUE qidB1Cache_; #else TQue qidA1_; TQue qidB1_; TQue qidA1Cache_; TQue qidB1Cache_; #endif LocalTensor cMatrix_; LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ SrcT aScalar_; SrcT bScalar_; DEBUG_CODE(int calCount_ = 0); TBuffAddr leftMatrix_; TBuffAddr rightMatrix_; TBuffAddr inputBias_; __gm__ SrcT* aGlobal_; __gm__ SrcBT* bGlobal_; __gm__ BiasT* biasGlobal_; TBuf<> calcBuf_; TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; #if __CCE_AICORE__ < 220 __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; LocalTensor localWorkspace; int nd2nz0ffset = 0; int transOffset = 0; int co2Offset = 0; #endif int singleCoreM_; int singleCoreN_; int singleCoreK_; // iterate nums in mnk axis int mIter_; int nIter_; int kIter_; // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases // measured in element int baseUseM_; int baseUseK_; int baseUseN_; // measured in cube block int blockUseM_; int blockUseK_; int blockUseN_; int32_t cacheProcA_, cacheProcB_; bool isFirstIter_; bool isTransposeA_; // whether A matrix need to transpose bool isTransposeB_; // whether B matrix need to transpose // whether enbale bias, default value is false bool enableBias_; int tailM_, tailK_, tailN_; // current c matrix coordinate int curM_, curN_; // current c matrix step size, there could be tail steps int curStepM_, curStepN_; // current c matrix step block coordinate int stepMIdx_, stepNIdx_; bool enHF32Mode_; int32_t hf32TransMode_; uint8_t subBlockIdx_; int baseMK_; int baseKN_; int baseMN_; int cacheA1Size_, cacheB1Size_; int depthA1_, depthB1_; uint64_t dataPtr_; uint64_t tilingPtr_; }; template struct MatmulParamsNormQuant : public MatmulParamsNorm { __aicore__ inline MatmulParamsNormQuant() {}; TQue qidFixPipe_; uint64_t quantScalar_ = 0; GlobalTensor quantTensor_; // 0: no quant, 1: deqf16, 2: vdeqf16, 3: QF322B8_PRE, 4: VQF322B8_PRE, 5: REQ8(s32->u8/s8), 6: VREQ8(s32->u8/s8) uint8_t quantMode_ = 0; }; template struct MatmulParamsMDL : public 
MatmulParamsBase { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulParamsMDL() {}; using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; using DstT = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; TQue qidBias_; TQue qidFixPipe_; typename L0cType::BUFFER CO1_; TQue qidA1_; TQue qidB1_; #if __CCE_AICORE__ < 220 TQue qidA2_; TQue qidB2_; TQue qidVecIn_; TQue qidCO2_; #endif LocalTensor cMatrix_; TBufHandle cacheA1BufPing_; TBufHandle cacheA1BufPong_; TBufHandle cacheB1BufPing_; TBufHandle cacheB1BufPong_; bool cacheA1IsCachingPing_; bool cacheA1IsCachingPong_; bool cacheB1IsCachingPing_; bool cacheB1IsCachingPong_; DEBUG_CODE(int calCount_ = 0); TBuffAddr leftMatrix_; TBuffAddr rightMatrix_; TBuffAddr inputBias_; __gm__ SrcT* aGlobal_; __gm__ SrcBT* bGlobal_; __gm__ BiasT* biasGlobal_; TBuf<> calcBuf_; TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; #if __CCE_AICORE__ < 220 __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; LocalTensor localWorkspace; int nd2nz0ffset = 0; int transOffset = 0; int co2Offset = 0; #endif int singleCoreM_; int singleCoreN_; int singleCoreK_; // iterate nums in mnk axis int mIter_; int nIter_; int kIter_; // iterate nums in mn step axis int mStepIter_; int nStepIter_; int kaStepIter_; int kbStepIter_; int kStepIter_; int minStepK_; int kaStepFactor_; int kbStepFactor_; // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases // in unit of element int baseUseM_; int baseUseK_; int baseUseN_; // in unit of cube block int blockUseM_; int blockUseK_; int blockUseN_; // in unit of element int baseUseStepM_; int baseUseStepN_; int baseUseStepKa_; int baseUseStepKb_; // in unit of cube block int blockUseStepM_; int blockUseStepN_; int blockUseStepKa_; int blockUseStepKb_; bool isFirstIter_; bool isTransposeA_; // whether A matrix need to transpose bool isTransposeB_; // whether B matrix need to transpose // whether enbale bias, default value is false bool enableBias_; // in unit of element int tailM_, tailK_, tailN_; // in unit of element int tailStepM_, tailStepN_, tailStepKa_, tailStepKb_; // current c matrix coordinate, in unit of baseMN int curM_, curN_; // current c matrix step size, in unit of baseMNK , there could be tail steps int curStepM_, curStepN_; // current c matrix step block coordinate, in unit of stepMNK int stepMIdx_, stepNIdx_, stepKaIdx_, stepKbIdx_; // stepKa == kIter bool isA1KFullLoad_, isB1KFullLoad_; bool enHF32Mode_; int32_t hf32TransMode_; uint8_t subBlockIdx_; int baseMK_; int baseKN_; int baseMN_; int cacheA1Factor_, cacheB1Factor_; uint64_t quantScalar_ = 0; uint64_t dataPtr_; uint64_t tilingPtr_; GlobalTensor quantTensor_; // 0: no quant, 1: deqf16, 2: vdeqf16; uint8_t quantMode_ = 0; // anti quant param. 
SrcT antiQuantOffsetScalar_; SrcT antiQuantScaleScalar_; LocalTensor antiQuantOffsetTensor_; LocalTensor antiQuantScaleTensor_; }; template struct MatmulParamsBasicBlock : public MatmulParamsNorm { __aicore__ inline MatmulParamsBasicBlock() {}; }; template struct MatmulParamsIBShareNorm : public MatmulParamsBase { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulParamsIBShareNorm() {}; using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; TQue qidBias_; typename L0cType::BUFFER CO1_; TQue qidA2_; TQue qidB2_; TQue qidVecIn_; TQue qidCO2_; typename QidType::QUE qidA1_; typename QidType::QUE qidA1Cache_; typename QidType::QUE qidB1_; typename QidType::QUE qidB1Cache_; LocalTensor cMatrix_; LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ SrcT aScalar_; SrcT bScalar_; DEBUG_CODE(int calCount_ = 0); TBuffAddr leftMatrix_; TBuffAddr rightMatrix_; TBuffAddr inputBias_; __gm__ SrcT* aGlobal_; __gm__ SrcT* bGlobal_; __gm__ BiasT* biasGlobal_; TBuf<> calcBuf_; TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; int singleCoreM_; int singleCoreN_; int singleCoreK_; // iterate nums in mnk axis int mIter_; int nIter_; int kIter_; // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases // measured in element int baseUseM_; int baseUseK_; int baseUseN_; // measured in cube block int blockUseM_; int blockUseK_; int blockUseN_; int32_t cacheProcA_, cacheProcB_; bool isFirstIter_; bool isTransposeA_; // whether A matrix need to transpose bool isTransposeB_; // whether B matrix need to transpose // whether enbale bias, default value is false bool enableBias_; int tailM_, tailK_, tailN_; // current c matrix coordinate int curM_, curN_; // current c matrix step size, there could be tail steps int curStepM_, curStepN_; // current c matrix step block coordinate int stepMIdx_, stepNIdx_; bool enHF32Mode_; int32_t hf32TransMode_; uint8_t subBlockIdx_; int baseMK_; int baseKN_; int baseMN_; int cacheA1Size_, cacheB1Size_; int depthA1_, depthB1_; uint64_t dataPtr_; uint64_t tilingPtr_; }; /* ************************************************************************************************** * MatmulParams * * ************************************************************************************************* */ template struct MatmulParams { __aicore__ inline MatmulParams(){}; }; // CFG_NORM #if __CCE_AICORE__ >= 220 template struct MatmulParams::value && IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value)))>::type> { __aicore__ inline MatmulParams(){}; using PARAMS = MatmulParamsNorm; }; #else template struct MatmulParams::value && IsSameType::value) || (IsSameType::value && IsSameType::value))>::type> { __aicore__ inline MatmulParams(){}; using PARAMS = MatmulParamsNorm; }; #endif #if __CCE_AICORE__ >= 220 template struct MatmulParams::value && IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value)))>::type> { __aicore__ inline MatmulParams(){}; using PARAMS = MatmulParamsNormQuant; }; #else template struct MatmulParams::value && IsSameType::value) || (IsSameType::value && IsSameType::value))>::type> { __aicore__ inline MatmulParams(){}; using PARAMS = MatmulParamsNormQuant; }; #endif // CFG_MDL template struct MatmulParams { __aicore__ inline 
MatmulParams() {}; using PARAMS = MatmulParamsMDL; }; // MM_CFG_BB template struct MatmulParams { __aicore__ inline MatmulParams() {}; using PARAMS = MatmulParamsBasicBlock; }; // CFG_IBSHARE_NORM template struct MatmulParams { __aicore__ inline MatmulParams() {}; using PARAMS = MatmulParamsIBShareNorm; }; /* ************************************************************************************************** * MatmulMacroImpl * * ************************************************************************************************* */ template struct MatmulMacroImpl { __aicore__ inline MatmulMacroImpl() {}; }; #if __CCE_AICORE__ >= 220 // CFG_NORM template struct MatmulMacroImpl { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulMacroImpl() {}; static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); using PARAMS = MacroMatmul; }; // CFG_MDL template struct MatmulMacroImpl { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulMacroImpl() {}; static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); using PARAMS = MacroMatmul; }; // CFG_IBSHARE_NORM template struct MatmulMacroImpl { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulMacroImpl() {}; static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); using PARAMS = MacroMatmul; }; #elif __CCE_AICORE__ == 200 template struct MatmulMacroImpl { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulMacroImpl() {}; using PARAMS = MacroMatmulV200; }; template struct MatmulMacroImpl { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulMacroImpl() {}; using PARAMS = MacroMatmulV200; }; #endif // MM_CFG_BB template struct MatmulMacroImpl { using L0cT = typename GetDstType::Type; __aicore__ inline MatmulMacroImpl() {}; using PARAMS = MacroMatmulBasic; }; template > class MatmulImpl { using L0cT = typename GetDstType::Type; using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; using DstT = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; public: __aicore__ inline MatmulImpl() {}; __aicore__ inline void Init(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe = nullptr); __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK); __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0); __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK); __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1); __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false); __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false); __aicore__ inline void SetBias(const GlobalTensor& biasGlobal); __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr); __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr); __aicore__ inline void SetAntiQuantScalar(const SrcT offsetScalar, const SrcT scaleScalar); __aicore__ inline void SetAntiQuantVector(const LocalTensor &offsetTensor, const LocalTensor &scaleTensor); __aicore__ inline void SetQuantScalar(const uint64_t quantScalar); __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor); __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTransposeA = false); __aicore__ inline void SetTensorAWithCopy(const 
GlobalTensor& gm, const LocalTensor& leftMatrix, bool isTransposeA = false); __aicore__ inline void SetTensorB(const LocalTensor& righMatrix, bool isTransposeB = false); __aicore__ inline void SetTensorA(SrcAT aScalar); __aicore__ inline void SetTensorB(SrcBT bScalar); __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& righMatrix, bool isTransposeB = false); __aicore__ inline void SetBias(const LocalTensor& inputBias); __aicore__ inline void SetBatchNum(int32_t batchA, int32_t batchB); __aicore__ inline void ClearBias(); template __aicore__ inline bool Iterate(bool enPartialSum = false); template __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false); template __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0); __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t enPartialSum, uint8_t enAtomic, uint32_t enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0); template __aicore__ inline void GetTensorC(const LocalTensor& co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false); template __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false); template __aicore__ inline void GetTensorC(const GlobalTensor &gm, const LocalTensor &co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false); template __aicore__ inline MatrixOffset GetOffsetC(); __aicore__ inline void End(); __aicore__ inline void SetHF32(bool enableHF32 = false, int32_t transMode = 0); __aicore__ inline void SetSubBlockIdx(uint8_t subBlockIdx); __aicore__ inline uint8_t GetSubBlockIdx(); template __aicore__ inline void SetWorkspace(__gm__ const T* addr, int size) { ASCENDC_ASSERT((addr != nullptr), { KERNEL_LOG(KERNEL_ERROR, "addr can not be nullptr"); }); var.cacheWorkspaceAddr = reinterpret_cast(const_cast<__gm__ T*>(addr)); } template __aicore__ inline void SetWorkspace(GlobalTensor& addr) { ASSERT(addr.GetSize() > 0); SetWorkspace(addr.GetPhyAddr(), addr.GetSize() * sizeof(T)); } __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) { #if __CCE_AICORE__ < 220 __ubuf__ uint8_t* addr = (__ubuf__ uint8_t*)tmpBuffer.GetPhyAddr(); ASCENDC_ASSERT((addr != nullptr), { KERNEL_LOG(KERNEL_ERROR, "addr can not be nullptr"); }); var.localWorkspace = tmpBuffer; var.cacheUBWorkspaceAddr = reinterpret_cast<__ubuf__ uint8_t* __restrict__>(const_cast<__ubuf__ uint8_t*>(addr)); var.nd2nz0ffset = 0; var.transOffset = 0; var.co2Offset = 0; int len = 0; if constexpr (MM_CFG.enVecND2NZ) { if constexpr (A_TYPE::format == CubeFormat::ND || B_TYPE::format == CubeFormat::ND || !PhyPosIsUB(C_TYPE::pos)) { len = var.tiling_->transLength + var.tiling_->transLength; } if (var.tiling_->isBias && BIAS_TYPE::pos != TPosition::VECCALC) { len = len < var.tiling_->baseN * sizeof(BiasT) ? 
var.tiling_->baseN * sizeof(BiasT) : len; } } else { if (var.tiling_->isBias && BIAS_TYPE::pos != TPosition::VECCALC) { len += var.tiling_->baseN * sizeof(BiasT); } if constexpr (C_TYPE::pos == TPosition::GM) { var.co2Offset = len; len += var.tiling_->baseM * var.tiling_->baseN * sizeof(DstT); const int blockCount = ONE_BLK_SIZE / sizeof(DstT); if (C_TYPE::format == CubeFormat::ND && var.tiling_->singleCoreN % blockCount != 0) { var.transOffset = len; len += 32; } } else if constexpr (C_TYPE::pos == TPosition::VECCALC && C_TYPE::format != CubeFormat::NZ) { var.co2Offset = len; len += var.tiling_->baseM * var.tiling_->baseN * sizeof(DstT); } if constexpr (A_TYPE::format == CubeFormat::ND || B_TYPE::format == CubeFormat::ND) { var.nd2nz0ffset = len; int aTmp = 0; int bTmp = 0; const int c0Size = ONE_BLK_SIZE / sizeof(SrcT); if (!var.isTransposeA_ && (var.tiling_->singleCoreK % c0Size != 0)) { aTmp = var.tiling_->baseM * 32; } else if (var.isTransposeA_ && (var.tiling_->singleCoreM % c0Size != 0)) { aTmp = var.tiling_->baseK * 32; } bTmp = GetND2NZOffsetB(); aTmp = (A_TYPE::pos == TPosition::TSCM) ? 0 : aTmp; bTmp = (B_TYPE::pos == TPosition::TSCM) ? 0 : bTmp; len += (aTmp >= bTmp) ? aTmp : bTmp; } } int size = tmpBuffer.GetSize(); ASSERT(size >= len); #else ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "current vecrsion do not support SetLocalWorkspace interface!"); }); #endif } #ifdef __CCE_KT_TEST__ public: uint32_t a1BigPackageLoadCount_ = 0; uint32_t b1BigPackageLoadCount_ = 0; uint32_t a1LoadCacheCount_ = 0; uint32_t b1LoadCacheCount_ = 0; #endif private: template friend __aicore__ inline void SetTPipe( MatmulImpl &mm, TPipe* tpipe); __aicore__ inline void InitNorm(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe); __aicore__ inline void InitMDL(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe); __aicore__ inline void InitBatch(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe); __aicore__ inline void InitIBShareNorm(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe); template __aicore__ inline bool IterateNorm(bool enPartialSum = false); template __aicore__ inline bool IterateBasicBlock(bool enPartialSum = false); template __aicore__ inline bool IterateBasicSpecialBlock(bool enPartialSum = false); template __aicore__ inline bool IterateMDL(bool enPartialSum = false); template __aicore__ inline bool IterateIBShareNorm(bool enPartialSum = false); template __aicore__ inline bool IterateSpecialMDL(bool enPartialSum = false); __aicore__ inline void EndNorm(); __aicore__ inline void EndMDL(); __aicore__ inline void EndIBShareNorm(); __aicore__ inline void InitStepMParams(); __aicore__ inline void InitStepNParams(); __aicore__ inline void InitStepKParams(); __aicore__ inline void ResetCacheA(); __aicore__ inline void ResetCacheB(); __aicore__ inline void ResetCacheA1(); __aicore__ inline void ResetCacheB1(); __aicore__ inline void LoadC(LocalTensor& co1Local, bool enPartialSum = false); __aicore__ inline void LoadBias(LocalTensor& cMatrix, int col); __aicore__ inline void Compute(bool enPartialSum = false); __aicore__ inline void ComputeNorm(bool enPartialSum = false); __aicore__ inline void ComputeBasic(bool enPartialSum = false); __aicore__ inline void ComputeSpecialBasic(bool enPartialSum = false); __aicore__ inline void ComputeMDL(bool enPartialSum = false); __aicore__ inline void ComputeIBShareNorm(bool enPartialSum = false); __aicore__ inline void ComputeSpecialMDL(bool enPartialSum = false); __aicore__ inline void ComputeBatch(const GlobalTensor& gm, uint32_t 
enPartialSum, uint8_t enAtomic, uint32_t enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0); __aicore__ inline void CalcBatchNum(const int32_t batchNumA, const int32_t batchNumB); // 1, Implement CacheA. The number of caches is depthA1. __aicore__ inline LocalTensor LoadACache2L1(int row, int col, int useM, int useK, int proc); __aicore__ inline LocalTensor LoadBCache2L1(int row, int col, int useK, int useN, int proc); __aicore__ inline LocalTensor LoadToAL1( int row, int col, int useN, int useK, bool insertDeQue = true); __aicore__ inline LocalTensor LoadToBL1( int row, int col, int useK, int useN, bool insertDeQue = true); __aicore__ inline LocalTensor LoadToAL1Norm(int row, int col, int useN, int useK); __aicore__ inline LocalTensor LoadToBL1Norm(int row, int col, int useN, int useK); __aicore__ inline LocalTensor LoadToAL1MDL( int row, int col, int useN, int useK, bool insertDeQue = true); __aicore__ inline LocalTensor LoadToBL1MDL( int row, int col, int useN, int useK, bool insertDeQue = true); __aicore__ inline LocalTensor LoadToAL1Basic(int row, int col, int useN, int useK); __aicore__ inline LocalTensor LoadToBL1Basic(int row, int col, int useN, int useK); __aicore__ inline LocalTensor LoadToAL1IBShareNorm(int row, int col, int useN, int useK); __aicore__ inline LocalTensor LoadToBL1IBShareNorm(int row, int col, int useN, int useK); __aicore__ inline bool OnCopyInA1(const LocalTensor& aMatrix, int row, int col, int useM, int useK); __aicore__ inline bool OnCopyInA1Trans(const LocalTensor& aMatrix, int row, int col, int useM, int useK); __aicore__ inline bool OnCopyInB1(const LocalTensor& bMatrix, int row, int col, int useK, int useN); __aicore__ inline bool OnCopyInB1Trans(const LocalTensor& bMatrix, int row, int col, int useK, int useN); __aicore__ inline void OnLoadInA2(const LocalTensor& dst, const LocalTensor& aMatrix); __aicore__ inline void OnLoadInB2(const LocalTensor& dst, const LocalTensor& bMatrix); __aicore__ inline void CopyND2NZ(const LocalTensor& dst, const GlobalTensor& src, const int row, const int col, const int height, const int width, const int gCol, const int ndNum = 1, const int srcNdMatrixStride = 0, const int dstNzMatrixStride = 0, const bool kAlignToC0Size = false); __aicore__ inline void CopyWeightND2NZ(const LocalTensor &dst, const GlobalTensor &src, const int row, const int col, const int height, const int width, const int gCol, const int ndNum = 1, const int srcNdMatrixStride = 0, const int dstNzMatrixStride = 0, const bool kAlignToC0Size = false); __aicore__ inline void CopyND2NZ(const LocalTensor& dst, LocalTensor& src, const int row, const int col, const int height, const int width, const int gCol, const bool isA1 = true); __aicore__ inline void AntiQuantCompute(const LocalTensor &quantOut, const LocalTensor &quantIn, bool isBankConflict); __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor& dst, GlobalTensor& src, const int row, const int col, const int height, const int width, const int gCol, const bool isA1 = true); __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor& dst, LocalTensor& src, const int row, const int col, const int height, const int width, const int gCol, const bool isA1 = true); __aicore__ inline void CopyNZ2NZ(const LocalTensor& dst, const GlobalTensor& src, const int row, const int col, const int height, const int width, const int gRow, const bool kAlignToC0Size = false); __aicore__ inline void CopyNZ2NZ(const LocalTensor& dst, const LocalTensor& 
src, const int row, const int col, const int height, const int width, const int gRow); __aicore__ inline void CopyVector2A1(const LocalTensor& dst, GlobalTensor& src, const int col); __aicore__ inline void CopyVector2A1(const LocalTensor& dst, LocalTensor& src, const int col); template __aicore__ inline int CopyNDBlock(const LocalTensor& transTensor, const GlobalTensor& src, int64_t srcOffset, const int height, const int width, const int gCol, const bool isBankConflict); template __aicore__ inline void NDPadZeros(LocalTensor &dst, const int height, const int calcWidth, const int gCol, const int width, bool isBankConflict); __aicore__ inline void NDTrans2NZ(LocalTensor& dst, LocalTensor& src, const int calcHigh, const int calcWidth, const bool isBankConflict); __aicore__ inline void TransDataBMatrix(const LocalTensor &dst, const LocalTensor &src, int height, int width); __aicore__ inline void UpdateDataCopyParamForQuant(DataCopyEnhancedParams& enhancedParams); __aicore__ inline void OnCopyInCO2(const LocalTensor& dst, const LocalTensor& src, bool enSequentialWrite = false); __aicore__ inline void OnCopyToCO2(const LocalTensor& dst, const LocalTensor& src, bool enSequentialWrite = false); __aicore__ inline void CopyCo22UBNZ2ND(const LocalTensor& dst, const LocalTensor& src, bool enSequentialWrite = false); __aicore__ inline void TransNZ2ND(const LocalTensor& dst, const LocalTensor& src, int height, int width, DstT scalar); __aicore__ inline void CopyToGMForNotAligned(const GlobalTensor &gmC, LocalTensor &trans, int32_t blocklen, bool enSequentialWrite = false, bool isTragetAligned = false); __aicore__ inline void CopyCo22GMNZ2ND(const GlobalTensor& gmC, LocalTensor& src, bool enSequentialWrite = false); __aicore__ inline void CopyCo22GMNZ2NDOnTheFly(const GlobalTensor& gmC, const LocalTensor& src, bool enSequentialWrite = false); __aicore__ inline void CopyFromDstGM(LocalTensor& src, const GlobalTensor& gmC, const CopyGMParams& params, bool enSequentialWrite = false); __aicore__ inline void OnCO2Copy2GM(const GlobalTensor& gmC, LocalTensor& src, bool enSequentialWrite = false); __aicore__ inline void CopyDeqTensorToL1(const LocalTensor& dst, const GlobalTensor& src, int32_t calNSize); __aicore__ inline void FixpipeOutToGm(const GlobalTensor &gm, const LocalTensor &co1Local, int curN, uint8_t enAtomic, bool enSequentialWrite); // do ping when isPong = flase, do pong when isPong = true __aicore__ inline TBufHandle GetCacheA1Buf(bool isPong); __aicore__ inline TBufHandle GetCacheB1Buf(bool isPong); __aicore__ inline bool GetCacheA1IsCaching(bool isPong); __aicore__ inline bool GetCacheB1IsCaching(bool isPong); __aicore__ inline void SetCacheA1Buf(bool isPong, TBufHandle buf); __aicore__ inline void SetCacheB1Buf(bool isPong, TBufHandle buf); __aicore__ inline void SetCacheA1IsCaching(bool isPong, bool isCaching); __aicore__ inline void SetCacheB1IsCaching(bool isPong, bool isCaching); __aicore__ inline void CheckIterSize(); __aicore__ inline void CheckBaseUseSize(); __aicore__ inline void CheckTiling(); __aicore__ inline void UpdateBatchIterateInfo(const int32_t batchNum, const int32_t batchIdx); __aicore__ inline int32_t GetBatchIterateBiasOffset(const int32_t batchNum, const int32_t batchIdx, bool& enableBiase); __aicore__ inline int32_t GetBatchIterateBOffset(const int32_t batchNum, const int32_t batchIdx); __aicore__ inline int32_t GetBatchIterateAOffset(const int32_t batchNum, const int32_t batchIdx); __aicore__ inline void LoadBatchBiasToL1(const int32_t batchOuterIdx = 0); __aicore__ 
    inline void LoadBatchBToL1(const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0);
    __aicore__ inline void OnCopyInBatchB1Trans(const LocalTensor& aMatrix, const int32_t batchOuterIdx);
    __aicore__ inline void LoadBatchAToL1(const uint32_t matrixStrideA = 0, const int32_t batchOuterIdx = 0);
    __aicore__ inline void OnCopyInBatchA1Trans(const LocalTensor& aMatrix, const int32_t batchOuterIdx);
    __aicore__ inline int32_t GetOrgBH();
    __aicore__ inline int32_t GetOrgAH();
    __aicore__ inline void GetTensorCForBatch(
        const GlobalTensor &cGlobal, const int32_t iBatchIn, uint8_t enAtomic, bool enSequentialWriteIn);
    __aicore__ inline void GetTensorCByLayout(const GlobalTensor &cGlobal, uint8_t enAtomic, bool enSequentialWrite,
        const uint32_t nGapOffset, const uint32_t mGapOffset);
    __aicore__ inline void GetTensorCSpecialMDL(const GlobalTensor& gm, uint8_t enAtomic = 0,
        bool enSequentialWrite = false);
    __aicore__ inline int GetND2NZOffsetB();
    __aicore__ inline void SetTransposeB(bool isTransposeB);

private:
#if __CCE_AICORE__ == 220 || __CCE_AICORE__ == 300 || __CCE_AICORE__ == 200
    typename MatmulMacroImpl::PARAMS matmulInstr_;
#endif
    typename MatmulParams::PARAMS var;

#if __CCE_AICORE__ < 220
    constexpr static int L1Size_ = 1024 * 1024;
    constexpr static int L0CSize_ = 256 * 1024;
#elif __CCE_AICORE__ == 300
    constexpr static int L1Size_ = 1024 * 1024;
    constexpr static int L0CSize_ = 128 * 1024;
#else
    constexpr static int L1Size_ = 512 * 1024;
    constexpr static int L0CSize_ = 128 * 1024;
#endif
    constexpr static int L0ASize_ = 64 * 1024;
    constexpr static int L0BSize_ = 64 * 1024;
    constexpr static int32_t factor_ = AuxGetFactor();
    constexpr static int32_t c0Size_ = AuxGetC0Size();

    int M_;
    int N_;
    int Ka_;
    int Kb_;
    int Kc_;
    int32_t batchA_ = 1, batchB_ = 1;
    int32_t batchOuter_ = 1;
};
} // namespace matmul

#include "../../impl/matmul/matmul_impl.h"
#endif
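
// ---------------------------------------------------------------------------------------------------------
// Usage sketch (illustrative only, kept out of compilation by the #if 0 guard): one common way to drive the
// MatmulImpl API declared above is Init() with a TCubeTiling, SetTensorA()/SetTensorB()/SetBias(), then
// IterateAll() into a GM output tensor, and finally End(). The kernel name, the half/float data types, the
// kernel_operator.h include and the choice of CFG_NORM as the MM_CFG template argument are assumptions made
// for this example, not requirements of this header; platform-specific setup (workspace binding, kernel-side
// matmul object registration) is omitted.
// ---------------------------------------------------------------------------------------------------------
#if 0
#include "kernel_operator.h"

using namespace AscendC;

__aicore__ inline void MatmulKernelExample(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR biasGM, GM_ADDR cGM,
    const TCubeTiling* __restrict tiling)
{
    // A/B are fp16 ND matrices in global memory; C and bias are fp32 ND tensors in global memory.
    using A_TYPE = matmul::MatmulType<TPosition::GM, CubeFormat::ND, half>;
    using B_TYPE = matmul::MatmulType<TPosition::GM, CubeFormat::ND, half>;
    using C_TYPE = matmul::MatmulType<TPosition::GM, CubeFormat::ND, float>;
    using BIAS_TYPE = matmul::MatmulType<TPosition::GM, CubeFormat::ND, float>;

    GlobalTensor<half> aGlobal;
    GlobalTensor<half> bGlobal;
    GlobalTensor<float> biasGlobal;
    GlobalTensor<float> cGlobal;
    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ half*>(aGM));
    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ half*>(bGM));
    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(biasGM));
    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(cGM));

    TPipe pipe;
    // CFG_NORM selects the MatmulVersion::NORMAL path (see GetMatmulVersion above).
    matmul::MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CFG_NORM> mm;
    mm.Init(tiling, &pipe);

    mm.SetTensorA(aGlobal);   // isTransposeA defaults to false
    mm.SetTensorB(bGlobal);   // isTransposeB defaults to false
    mm.SetBias(biasGlobal);
    mm.IterateAll(cGlobal);   // computes every baseM x baseN block of C and writes it back to cGlobal
    mm.End();

    // Alternatively, iterate block by block:
    //   while (mm.Iterate()) { mm.GetTensorC(cGlobal); }
}
#endif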