/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* !
 * \file kernel_tpipe.h
 * \brief
 */
#ifndef ASCENDC_KERNEL_QUEUE_H
#define ASCENDC_KERNEL_QUEUE_H
#include "kernel_tpipe_base.h"
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
#include
#include
#include
#include
#endif

namespace AscendC {
class TPipe;

template <TPosition src, TPosition dst, int32_t depth, auto mask = 0>
class TQueBind {
public:
    __aicore__ inline TQueBind();
    __aicore__ inline void FreeBuffer(TBufHandle buf);
    __aicore__ inline TBuffAddr GetBufferAddr(TBufHandle buf);
    template <typename T>
    __aicore__ inline __sync_noalias__ LocalTensor<T> AllocTensor();
    template <typename T>
    __aicore__ inline void FreeTensor(LocalTensor<T>& tensor);
    template <typename T>
    __aicore__ inline bool EnQue(const LocalTensor<T>& tensor);
    __aicore__ inline bool EnQue(TBufHandle buf);
    template <typename T>
    __aicore__ inline LocalTensor<T> DeQue();
    __aicore__ inline TBufHandle DeQue();
    __aicore__ inline bool VacantInQue();
    __aicore__ inline bool HasTensorInQue();
    __aicore__ inline int32_t GetTensorCountInQue();
    __aicore__ inline bool HasIdleBuffer();
    __aicore__ inline void FreeAllEvent();
    template <typename T>
    __aicore__ inline TBufState GetState(const LocalTensor<T>& tensor) const;

protected:
    static constexpr TQueConfig config = GetTQueConfig(mask);
    static constexpr bool nd2nz = config.nd2nz;
    static constexpr bool nz2nd = config.nz2nd;
    static constexpr bool scmBlockGroup = config.scmBlockGroup;
    static constexpr TPosition srcPosition = src;
    static constexpr TPosition dstPosition = dst;
    static constexpr Hardware srcHardType = GetPhyType(src);
    static constexpr Hardware dstHardType = GetPhyType(dst);
    static constexpr HardEvent enQueEvt = GetQueEvt(srcHardType, dstHardType, true, nd2nz, nz2nd);
    static constexpr HardEvent freeBufEvt = GetQueEvt(srcHardType, dstHardType, false, nd2nz, nz2nd);
    static constexpr int32_t queDepth = depth;
    union {
        uint64_t value;
        struct {
            uint8_t bufNum = 0;
            uint8_t usedCount;
            uint16_t head;
            uint16_t tail;
            uint8_t bufUsedCount;
            uint8_t bufCursor;
        };
    };
    typename TBufHandleAux<depth>::T que_;
    struct TBufType* bufStart;
    DEBUG_CODE(uint32_t bufLen);

    friend class TPipe;
    template <TPosition p, int32_t d, auto m>
    friend class TQue;
    template <TPosition p>
    friend class TBufPool;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    uint64_t bufPoolHandle{0U};
#endif

private:
    __aicore__ inline void SetTBufPoolHandle(uint64_t bufPoolHandle);
    template <typename T>
    __aicore__ inline LocalTensor<T> Buf2Tensor(TBufHandle buf);
    __aicore__ inline TBufState GetState(const TBufHandle& handle) const;
    static constexpr bool isTQue = true;
    __aicore__ inline TBufHandle AllocBuffer();
};
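
/*
 * brief: demo case for the buffer life cycle (AllocTensor/EnQue/DeQue/FreeTensor)
 * exposed by TQueBind and its TQue subclass below; illustrative sketch only,
 * the buffer count, lengths and the GlobalTensor gmSrc are assumptions:
 *   TPipe pipe;
 *   TQue<TPosition::VECIN, 2> inQue;                  // depth 2: double buffering
 *   pipe.InitBuffer(inQue, 2, 128 * sizeof(half));    // 2 buffers of 128 halves
 *   LocalTensor<half> in = inQue.AllocTensor<half>(); // take an idle buffer
 *   DataCopy(in, gmSrc, 128);                         // stage data into it
 *   inQue.EnQue(in);                                  // publish it, sets enQueEvt
 *   LocalTensor<half> data = inQue.DeQue<half>();     // consumer waits on the event
 *   // ... compute on data ...
 *   inQue.FreeTensor(data);                           // recycle it, sets freeBufEvt
 */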
// Template Args:
// pos - position of the queue, such as VECIN/VECOUT/A1...
// depth - the depth of the queue
// mask - the 0th bit is nd2nz, 1 means data is transformed from ND format to NZ format
//        the 1st bit is nz2nd, 1 means data is transformed from NZ format to ND format
template <TPosition pos, int32_t depth, auto mask = 0>
class TQue : public TQueBind<pos, pos, depth, mask> {
public:
    __aicore__ inline TQue() = default;

private:
    friend class TPipe;
    template <TPosition p>
    friend class TBufPool;
    static constexpr bool isTQue = true;
};

template <TPosition pos>
class TBuf : public TQueBind<pos, pos, 1> {
public:
    __aicore__ inline TBuf() = default;
    template <typename T>
    __aicore__ inline LocalTensor<T> Get();
    template <typename T>
    __aicore__ inline LocalTensor<T> Get(uint32_t len);
    template <typename T>
    __aicore__ inline LocalTensor<T> GetWithOffset(uint32_t size, uint32_t bufOffset);
    // functions inherited from class TQueBind
    template <typename T>
    __aicore__ inline void EnQue(const LocalTensor<T>& tensor);
    template <typename T>
    __aicore__ inline LocalTensor<T> DeQue();
    template <typename T>
    __aicore__ inline LocalTensor<T> AllocTensor();
    template <typename T>
    __aicore__ inline void FreeTensor(LocalTensor<T>& tensor);
    template <typename T>
    __aicore__ inline TBufState GetState(const LocalTensor<T>& tensor) const;
    __aicore__ inline bool EnQue(TBufHandle buf);
    __aicore__ inline TBufHandle DeQue();
    __aicore__ inline void FreeBuffer(TBufHandle buf);
    __aicore__ inline TBuffAddr GetBufferAddr(TBufHandle buf);

private:
    __aicore__ inline TBufHandle Get();
    __aicore__ inline TBufHandle Get(uint32_t len);
    __aicore__ inline uint32_t GetBufLen() const;
    __aicore__ inline void SetTpipeBuf(TBufType* bufStartIn, uint32_t bufLenIn);
    template <TPosition p>
    friend __aicore__ inline bool PopStackBuffer(TBuf<p>& popBuffer, TBufType& bufStart);
    __aicore__ inline TBufHandle AllocBuffer();

private:
    struct TBufType* bufStart;
    uint32_t bufLen;
    uint32_t offset;
    friend class TPipe;
    template <TPosition p>
    friend class TBufPool;
    static constexpr bool isTQue = false;
};

template <TPosition pos>
class TBufPool {
public:
    static constexpr TPosition poolPos = pos;

public:
    __aicore__ inline TBufPool();
    __aicore__ inline ~TBufPool();
    template <typename T>
    __aicore__ inline bool InitBuffer(T& que, uint8_t num, uint32_t len);
    template <TPosition p>
    __aicore__ inline bool InitBuffer(TBuf<p>& buf, uint32_t len);
    template <typename T, typename U>
    __aicore__ inline bool InitBufPool(T& bufPool, uint32_t len, U& shareBuf);
    template <typename T>
    __aicore__ inline bool InitBufPool(T& bufPool, uint32_t len);
    __aicore__ inline void Reset();

protected:
    TBufPoolImpl g_tBufPoolImpl;

private:
    __aicore__ inline void Init();
    __aicore__ inline void ResetPool();

private:
    friend class TPipe;
    template <TPosition s, TPosition d, int32_t dep, auto m>
    friend class TQueBind;
    template <TPosition p, int32_t dep, auto m>
    friend class TQue;
    template <TPosition p>
    friend class TBuf;
    static constexpr bool isTbufPool = true;
};
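
/*
 * brief: demo case for TBuf scratch memory and TBufPool sub-pools; illustrative
 * sketch only, the positions, lengths and names (tmpBuf, pool, que) are assumptions:
 *   TPipe pipe;
 *   TBuf<TPosition::VECCALC> tmpBuf;
 *   pipe.InitBuffer(tmpBuf, 4096);                 // one 4 KB scratch buffer
 *   LocalTensor<float> tmp = tmpBuf.Get<float>();  // view the whole buffer as floats
 *   TBufPool<TPosition::VECIN> pool;               // carve a sub-pool out of the position
 *   pipe.InitBufPool(pool, 8192);
 *   TQue<TPosition::VECIN, 1> que;
 *   pool.InitBuffer(que, 1, 4096);                 // queue backed by the sub-pool
 *   pool.Reset();                                  // hand the sub-pool memory back
 */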
class TPipe {
public:
    __aicore__ inline TPipe();
    __aicore__ inline ~TPipe();
    __aicore__ inline void Init();
    template <typename T>
    __aicore__ inline bool InitBuffer(T& que, uint8_t num, uint32_t len);
    template <TPosition p>
    __aicore__ inline bool InitBuffer(TBuf<p>& buf, uint32_t len);
    template <typename T>
    __aicore__ inline bool InitBufPool(T& bufPool, uint32_t len);
    template <typename T, typename U>
    __aicore__ inline bool InitBufPool(T& bufPool, uint32_t len, U& shareBuf);
    template <HardEvent evt>
    __aicore__ inline TEventID AllocEventID();
    template <HardEvent evt>
    __aicore__ inline void ReleaseEventID(TEventID id);
    template <HardEvent evt>
    __aicore__ inline TEventID FetchEventID();
    __aicore__ inline TEventID FetchEventID(HardEvent evt);
    template <typename T>
    __aicore__ inline LocalTensor<T> GetAbsAddr(int32_t offset, int32_t size) const;
    template <TPosition pos>
    __aicore__ inline TBuffAddr GetAbsAddr(int32_t offset, int32_t len) const;
    __aicore__ inline void InitShareBufStart(uint32_t mode, uint32_t* shareLens, uint32_t lens, uint8_t subBlockIdx);
    __aicore__ inline void InitShareBufEnd();
    /*
     * brief: these functions are used to operate on the spm buffer;
     * demo case:
     *   GlobalTensor<T> workTensor;
     *   tpipe.InitSpmBuffer(workTensor, size);
     *   LocalTensor<T> calcTensor = calcBuf.Get<T>(size); // calcBuf: a TBuf initialized via InitBuffer
     *   // when the local buffer is not enough, spill local data to the spm buffer;
     *   tpipe.WriteSpmBuffer(calcTensor, size);
     *   // ...
     *   // read the data back from the spm buffer into local memory
     *   tpipe.ReadSpmBuffer(calcTensor, size);
     */
    template <typename T>
    __aicore__ inline void InitSpmBuffer(const GlobalTensor<T>& workspace, const int32_t bufferSize);
    __aicore__ inline void InitSpmBuffer(const int32_t bufferSize);
    template <typename T>
    __aicore__ inline void WriteSpmBuffer(const LocalTensor<T>& writeLocal, const DataCopyParams& copyParams,
        int32_t writeOffset = 0);
    template <typename T>
    __aicore__ inline void ReadSpmBuffer(const LocalTensor<T>& readLocal, const DataCopyParams& copyParams,
        int32_t readOffset = 0);
    template <typename T>
    __aicore__ inline void WriteSpmBuffer(const LocalTensor<T>& writeLocal, const int32_t writeSize,
        int32_t writeOffset = 0);
    template <typename T>
    __aicore__ inline void ReadSpmBuffer(const LocalTensor<T>& readLocal, const int32_t readSize,
        int32_t readOffset = 0);
    __aicore__ inline void Destroy();
    __aicore__ inline void Reset();
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    template <typename T>
    inline uint64_t GetAbsAddr(const LocalTensor<T>& tensor);
    inline uint8_t* GetBaseAddr(int8_t logicPos);
#endif

protected:
    TPipeImpl g_tpipeImpl;
    __aicore__ inline void AuxShareBufStart(uint32_t mode, uint32_t* shareLens, uint8_t pos, Hardware hard,
        uint8_t subBlockIdx);
    template <TPosition s, TPosition d, int32_t dep, auto m>
    friend class TQueBind;
    template <TPosition p, int32_t dep, auto m>
    friend class TQue;
    template <TPosition p>
    friend class TBuf;
    template <TPosition p>
    friend class TBufPool;
    template <TPosition p>
    friend __aicore__ inline bool PopStackBuffer(TBuf<p>& popBuffer, TBufType& bufStart);
    template <typename T, TPosition p>
    friend __aicore__ inline bool PopStackBuffer(LocalTensor<T>& popLocal);
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    void inline SetBufferCtx(Hardware hard, struct BufPoolExtra* bufPool);
#endif

private:
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    template <typename T>
    friend inline uint64_t GetAbsAddr(TPipe* tpipe, const LocalTensor<T>& tensor);
#endif
    friend __aicore__ inline void InitShareBufStart(TPipe* tpipe, uint32_t mode, uint32_t* shareLens, uint32_t lens,
        uint8_t subBlockIdx);
    friend __aicore__ inline void InitShareBufEnd(TPipe* tpipe);
    __aicore__ inline void InitSocState() const;
    __aicore__ inline void ResetPool();
    template <typename T>
    __aicore__ inline bool TscmInitBuffer(T& que, uint8_t num, uint32_t len);
    /*
     * brief: this function is used to get the end address of a queue.
     */
    template <TPosition pos>
    __aicore__ inline uint64_t GetQueueEndAddress();
};

template <TPosition pos, int32_t depth, auto mask = 0>
using TSCM = TQueBind<TPosition::GM, pos, depth, mask>;
} // namespace AscendC
#endif // ASCENDC_KERNEL_QUEUE_H