/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*!
 * \file kernel_tpipe_interface_impl.h
 * \brief
 */
#ifndef ASCENDC_MODULE_TPIPE_INTERFACE_IMPL_H
#define ASCENDC_MODULE_TPIPE_INTERFACE_IMPL_H
#include "kernel_tpipe.h"

namespace AscendC {
template <typename T>
__aicore__ inline __in_pipe__(V) void NopInPipeV(const T& tensor)
{
    (void)(0);
}

template <typename T>
__aicore__ inline __out_pipe__(V) void NopOutPipeV(const T& tensor)
{
    (void)(0);
}

// begin impl of IsAivTscm used by tquebind
__aicore__ inline constexpr bool IsAivTscm(TPosition src, TPosition dst)
{
#if __CCE_AICORE__ == 220
    if (GetPosition(src, dst) == TPosition::TSCM) {
        return true;
    }
#else
    (void)(src);
    (void)(dst);
#endif
    return false;
}

// begin impl of tquebind
// TQueBind : this is used for off-standard queue
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TQueBind<src, dst, depth, mask>::TQueBind()
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    AscendCQueCreate(static_cast<uint8_t>(src), static_cast<uint8_t>(dst), depth);
#endif // __CCE_KT_TEST__
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TQueBind<src, dst, depth, mask>::AllocTensor()
{
    auto buf = AllocBuffer();
    return Buf2Tensor<T>(buf);
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline void TQueBind<src, dst, depth, mask>::FreeTensor(LocalTensor<T>& tensor)
{
    FreeBuffer(tensor.GetBufferHandle());
    return;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::EnQue(const LocalTensor<T>& tensor)
{
    if constexpr (GetPhyType(src) == Hardware::UB || GetPhyType(dst) == Hardware::UB) {
        NopInPipeV<LocalTensor<T>>(tensor);
    }
    auto buf = tensor.GetBufferHandle();
    return EnQue(reinterpret_cast<TBufHandle>(buf));
}
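/*
 * Illustrative usage sketch of the queue API implemented below. This is not
 * part of the header itself; names such as `pipe`, `inQue`, `gmSrc`, `Compute`
 * and `TILE_LEN` are placeholders, and the template arguments assume a plain
 * VECIN copy-in queue with double buffering.
 *
 *     TPipe pipe;
 *     TQue<TPosition::VECIN, 2> inQue;
 *     pipe.InitBuffer(inQue, 2, TILE_LEN * sizeof(half));
 *     for (int32_t i = 0; i < tileNum; ++i) {
 *         LocalTensor<half> t = inQue.AllocTensor<half>(); // FREE -> OCCUPIED
 *         DataCopy(t, gmSrc[i * TILE_LEN], TILE_LEN);      // MTE2 fills the tile
 *         inQue.EnQue(t);                                  // posts the enqueue event
 *         LocalTensor<half> w = inQue.DeQue<half>();       // waits for that event
 *         Compute(w);                                      // placeholder consumer
 *         inQue.FreeTensor(w);                             // back to FREE
 *     }
 */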
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::EnQue(TBufHandle buf)
{
    ASCENDC_ASSERT((this->usedCount < depth), {
        KERNEL_LOG(KERNEL_ERROR, "usedCount is %d, which exceeds depth limit %d",
            static_cast<int32_t>(usedCount), depth);
    });
    auto ptr = reinterpret_cast<TBufType*>(buf);
    if constexpr (depth == 1) {
        this->que_ = buf;
    } else {
        this->que_[this->tail] = buf;
    }
    this->usedCount++;
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    ASCENDC_ASSERT((ptr->state == TBufState::OCCUPIED) || (ptr->state == TBufState::DEQUE), {
        KERNEL_LOG(KERNEL_ERROR, "ptr state is %d, which should be OCCUPIED / DEQUE",
            static_cast<int32_t>(ptr->state));
    });
    DEBUG_CODE(ptr->state = TBufState::ENQUE);
    /* Add for TSCM
     * for 220, the aiv just sends a message, no need to add this set/wait */
#if __CCE_AICORE__ == 220
    // If the AIC is not entered, the AIV does not process any event ID.
    if (g_coreType != AIV || (GetPosition(src, dst) != TPosition::TSCM)) {
        auto enQueEvtID = GetTPipePtr()->AllocEventID<enQueEvt>();
        SetFlag<enQueEvt>(enQueEvtID);
        ptr->enQueEvtID = enQueEvtID;
    }
#else
    auto enQueEvtID = GetTPipePtr()->AllocEventID<enQueEvt>();
    SetFlag<enQueEvt>(enQueEvtID);
    ptr->enQueEvtID = enQueEvtID;
#endif
    if constexpr (depth != 1) {
        if (++this->tail >= depth) {
            this->tail = 0;
        }
    }
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufEnque(static_cast<uint8_t>(src), static_cast<uint8_t>(dst),
        static_cast<uint8_t>(GetPosition(src, dst)), reinterpret_cast<uint64_t>(absAddr + ptr->address));
#endif // __CCE_KT_TEST__
    return true;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline LocalTensor<T> TQueBind<src, dst, depth, mask>::DeQue()
{
    auto buf = DeQue();
    auto ret = Buf2Tensor<T>(buf);
    if constexpr (GetPhyType(src) == Hardware::UB || GetPhyType(dst) == Hardware::UB) {
        NopOutPipeV<LocalTensor<T>>(ret);
    }
    return ret;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBufHandle TQueBind<src, dst, depth, mask>::DeQue()
{
    TBufHandle buf;
    if constexpr (depth == 1) {
        buf = this->que_;
    } else {
        buf = this->que_[this->head];
    }
    ASCENDC_ASSERT((buf != nullptr), { KERNEL_LOG(KERNEL_ERROR, "buf can not be nullptr"); });
    auto ptr = reinterpret_cast<TBufType*>(buf);
    ASCENDC_ASSERT((ptr->state == TBufState::ENQUE), {
        KERNEL_LOG(KERNEL_ERROR, "ptr state is %d, which can only be ENQUE", static_cast<int32_t>(ptr->state));
    });
    ASCENDC_ASSERT((this->usedCount > 0), {
        KERNEL_LOG(KERNEL_ERROR, "usedCount is %d, which must be larger than 0",
            static_cast<int32_t>(this->usedCount));
    });
    this->usedCount--;
    /* Add for TSCM
     * for 220, the aiv just sends a message, no need to add this set/wait */
    DEBUG_CODE(ptr->state = TBufState::DEQUE);
#if __CCE_AICORE__ == 220
    if (g_coreType != AIV || (GetPosition(src, dst) != TPosition::TSCM)) {
        if (ptr->enQueEvtID != INVALID_TEVENTID) {
            WaitFlag<enQueEvt>(ptr->enQueEvtID);
            GetTPipePtr()->ReleaseEventID<enQueEvt>(ptr->enQueEvtID);
            ptr->enQueEvtID = INVALID_TEVENTID;
        }
    }
#else
    if (ptr->enQueEvtID != INVALID_TEVENTID) {
        WaitFlag<enQueEvt>(ptr->enQueEvtID);
        GetTPipePtr()->ReleaseEventID<enQueEvt>(ptr->enQueEvtID);
        ptr->enQueEvtID = INVALID_TEVENTID;
    }
#endif
    if constexpr (depth != 1) {
        if (++this->head >= depth) {
            this->head = 0;
        }
    }
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufDeque(static_cast<uint8_t>(src), static_cast<uint8_t>(dst),
        static_cast<uint8_t>(GetPosition(src, dst)), (uint64_t)(absAddr + ptr->address));
#endif // __CCE_KT_TEST__
    return reinterpret_cast<TBufHandle>(buf);
}
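/*
 * Synchronization sketch for the EnQue/DeQue pair above, assuming a VECIN
 * copy-in queue whose enQueEvt is HardEvent::MTE2_V (a simplified view; TSCM
 * queues on 220 skip the set/wait as noted in the code):
 *
 *     EnQue(buf): id = AllocEventID<MTE2_V>(); SetFlag<MTE2_V>(id);   // producer side
 *     DeQue():    WaitFlag<MTE2_V>(id); ReleaseEventID<MTE2_V>(id);   // consumer side
 *
 * The queue itself only moves buffer handles; the Set/Wait pair is what orders
 * the producing pipe against the consuming pipe for each buffer.
 */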
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline void TQueBind<src, dst, depth, mask>::FreeBuffer(TBufHandle buf)
{
    auto ptr = reinterpret_cast<TBufType*>(buf);
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    ASCENDC_ASSERT((ptr->state != TBufState::FREE), {
        KERNEL_LOG(KERNEL_ERROR, "ptr state is %d, which can not be FREE", static_cast<int32_t>(ptr->state));
    });
    if constexpr (!IsAivTscm(src, dst)) {
        if constexpr (src == TPosition::C1 || (src == TPosition::CO2 && dst == TPosition::VECIN)) {
            SetFlag<freeBufEvt>(0); // insert pipe_v without eventID
            ASCENDC_ASSERT((ptr->freeBufEvtID == INVALID_TEVENTID),
                { KERNEL_LOG(KERNEL_ERROR, "freebuf event id should be -1 here"); });
        } else {
            ptr->freeBufEvtID = GetTPipePtr()->AllocEventID<freeBufEvt>();
            SetFlag<freeBufEvt>(ptr->freeBufEvtID);
        }
    }
    ptr->state = TBufState::FREE;
    this->bufUsedCount--;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufFree(static_cast<uint8_t>(bufferType), static_cast<uint8_t>(GetPosition(src, dst)),
        (uint64_t)(absAddr + ptr->address), static_cast<uint64_t>(ptr->dataLen));
#endif // __CCE_KT_TEST__
    return;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline void TQueBind<src, dst, depth, mask>::SetTBufPoolHandle(uint64_t bufPoolHandle)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    this->bufPoolHandle = bufPoolHandle;
#else
    (void)(bufPoolHandle);
#endif
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBufHandle TQueBind<src, dst, depth, mask>::AllocBuffer()
{
    DEBUG_CODE(int32_t size = 0);
    ASCENDC_ASSERT((bufNum > 0), {
        KERNEL_LOG(KERNEL_ERROR, "bufNum is %d, which must be larger than 0", static_cast<int32_t>(bufNum));
    });
    TBufType* ret;
    do {
        ret = this->bufStart + this->bufCursor;
        if constexpr (config.bufferNumber != 1) {
            this->bufCursor += 1;
            if (this->bufCursor == this->bufNum) {
                this->bufCursor = 0;
            }
        }
        if (ret->state == TBufState::FREE) {
            ret->state = TBufState::OCCUPIED;
            if constexpr (IsAivTscm(src, dst)) {
                break;
            }
            if constexpr (src == TPosition::C1) {
                if (ret->freeBufEvtID != INVALID_TEVENTID) {
                    WaitFlag<freeBufEvt>(ret->freeBufEvtID);
                    GetTPipePtr()->ReleaseEventID<freeBufEvt>(ret->freeBufEvtID);
                    ret->freeBufEvtID = INVALID_TEVENTID;
                }
            } else {
                if (ret->freeBufEvtID != INVALID_TEVENTID) {
                    WaitFlag<freeBufEvt>(ret->freeBufEvtID);
                    GetTPipePtr()->ReleaseEventID<freeBufEvt>(ret->freeBufEvtID);
                    ret->freeBufEvtID = INVALID_TEVENTID;
                }
            }
            break;
        }
        ASCENDC_ASSERT((++size <= this->bufNum), {
            KERNEL_LOG(KERNEL_ERROR, "size is %d, which exceeds limit %d", size,
                static_cast<int32_t>(this->bufNum));
        });
    } while (true);
    this->bufUsedCount++;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufAlloc(static_cast<uint8_t>(bufferType), static_cast<uint8_t>(GetPosition(src, dst)),
        reinterpret_cast<uint64_t>(absAddr + ret->address), static_cast<uint64_t>(ret->dataLen));
    if (this->bufPoolHandle != 0U) {
        AscendCUpdateTbufPoolStatus(this->bufPoolHandle, false);
        AscendCTBufPoolResetCheck(static_cast<uint8_t>(GetPosition(srcPosition, dstPosition)),
            reinterpret_cast<uint64_t>(absAddr + ret->address), static_cast<uint64_t>(ret->dataLen),
            this->bufPoolHandle);
    }
#endif // __CCE_KT_TEST__
    return reinterpret_cast<TBufHandle>(ret);
}
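/*
 * Buffer lifecycle driven by AllocBuffer/EnQue/DeQue/FreeBuffer (the ENQUE and
 * DEQUE transitions are tracked via DEBUG_CODE, i.e. in debug builds only):
 *
 *     FREE -> OCCUPIED -> ENQUE -> DEQUE -> FREE
 *        AllocBuffer     EnQue    DeQue    FreeBuffer
 *
 * FreeBuffer posts freeBufEvt, and the next AllocBuffer on the same slot waits
 * for it, so a buffer is never reused before its consumer pipe has finished.
 */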
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::VacantInQue()
{
    return usedCount < depth;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::HasTensorInQue()
{
    return usedCount;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline int32_t TQueBind<src, dst, depth, mask>::GetTensorCountInQue()
{
    return usedCount;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::HasIdleBuffer()
{
    return bufUsedCount < bufNum;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline void TQueBind<src, dst, depth, mask>::FreeAllEvent()
{
    auto ptr = this->bufStart;
    for (int i = 0; i < this->bufNum; i++, ptr++) {
        // should be in deque status
        ASCENDC_ASSERT((ptr->enQueEvtID == INVALID_TEVENTID),
            { KERNEL_LOG(KERNEL_ERROR, "enque event id should be -1 here"); });
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlag<freeBufEvt>(ptr->freeBufEvtID);
            GetTPipePtr()->ReleaseEventID<freeBufEvt>(ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBuffAddr TQueBind<src, dst, depth, mask>::GetBufferAddr(TBufHandle buf)
{
    ASCENDC_ASSERT((GetPosition(src, dst) != TPosition::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be GM"); });
    auto ptr = reinterpret_cast<TBufType*>(buf);
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(GetPosition(src, dst));
    addr.bufferHandle = buf;
    addr.bufferAddr = ptr->address;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    return addr;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline TBufState TQueBind<src, dst, depth, mask>::GetState(const LocalTensor<T>& tensor) const
{
    return GetState(tensor.GetBufferHandle());
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TQueBind<src, dst, depth, mask>::Buf2Tensor(TBufHandle buf)
{
    TBuffAddr addr = GetBufferAddr(buf);
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBufState TQueBind<src, dst, depth, mask>::GetState(const TBufHandle& handle) const
{
    if (handle == nullptr) {
        return TBufState::FREE;
    }
    auto ptr = reinterpret_cast<TBufType*>(handle);
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    return ptr->state;
}
// begin impl of tbuf
template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::Get(uint32_t len)
{
    uint32_t dataLen;
    if constexpr (IsSameType<T, int4b_t>::value) {
        dataLen = len / INT4_TWO;
    } else {
        dataLen = len * sizeof(T);
    }
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    ASCENDC_ASSERT((dataLen % 32 == 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be a multiple of 32 bytes", len); });
    ASCENDC_ASSERT((dataLen <= bufLen),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, max buffer len is %u", dataLen, bufLen); });
#endif
    auto ptr = this->bufStart;
    ptr->dataLen = dataLen;
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = reinterpret_cast<TBufHandle>(ptr);
    addr.bufferAddr = ptr->address;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(GetPhyType(pos))].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
    AscendCBufGet(addr.logicPos, static_cast<uint8_t>(GetPhyType(pos)),
        reinterpret_cast<uint64_t>(addr.absAddr), len);
    if (this->bufPoolHandle != 0U) {
        AscendCUpdateTbufPoolStatus(this->bufPoolHandle, false);
        AscendCTBufPoolResetCheck(static_cast<uint8_t>(GetPhyType(pos)),
            reinterpret_cast<uint64_t>(absAddr + ptr->address), static_cast<uint64_t>(ptr->dataLen),
            this->bufPoolHandle);
    }
#endif
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::Get()
{
    if constexpr (IsSameType<T, int4b_t>::value) {
        return Get<T>(bufLen * INT4_TWO);
    } else {
        return Get<T>(bufLen / sizeof(T));
    }
}

template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::GetWithOffset(uint32_t size, uint32_t bufOffset)
{
    auto ptr = this->bufStart;
    ptr->dataLen = size * sizeof(T);
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = reinterpret_cast<TBufHandle>(ptr);
    addr.bufferAddr = ptr->address + bufOffset;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(pos));
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

template <TPosition pos>
__aicore__ inline void TBuf<pos>::SetTpipeBuf(TBufType* bufStartIn, uint32_t bufLenIn)
{
    this->bufStart = bufStartIn;
    this->bufLen = bufLenIn;
    this->offset = 0;
}

template <TPosition pos>
template <typename T>
__aicore__ inline void TBuf<pos>::EnQue(const LocalTensor<T>& tensor)
{
    (void)(0);
}

template <TPosition pos>
template <typename T>
__aicore__ inline LocalTensor<T> TBuf<pos>::DeQue()
{
    return Get<T>();
}

template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::AllocTensor()
{
    return Get<T>();
}

template <TPosition pos>
template <typename T>
__aicore__ inline void TBuf<pos>::FreeTensor(LocalTensor<T>& tensor)
{
    (void)(0);
}

template <TPosition pos>
template <typename T>
__aicore__ inline TBufState TBuf<pos>::GetState(const LocalTensor<T>& tensor) const
{
    TBufHandle handle = tensor.GetBufferHandle();
    if (handle == nullptr) {
        return TBufState::FREE;
    }
    auto ptr = reinterpret_cast<TBufType*>(handle);
    return ptr->state;
}

template <TPosition pos>
__aicore__ inline bool TBuf<pos>::EnQue(TBufHandle buf)
{
    return true;
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::DeQue()
{
    return Get();
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::AllocBuffer()
{
    return Get();
}

template <TPosition pos>
__aicore__ inline void TBuf<pos>::FreeBuffer(TBufHandle buf)
{
    (void)(0);
}

template <TPosition pos>
__aicore__ inline TBuffAddr TBuf<pos>::GetBufferAddr(TBufHandle buf)
{
    auto ptr = reinterpret_cast<TBufType*>(buf);
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = buf;
    addr.bufferAddr = ptr->address;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(GetPhyType(pos))].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    return addr;
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::Get(uint32_t len)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((len <= bufLen),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, max buffer len is %u", len, bufLen); });
#endif
    this->bufStart->dataLen = len;
    return reinterpret_cast<TBufHandle>(this->bufStart);
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::Get()
{
    return Get(bufLen);
}

template <TPosition pos>
__aicore__ inline uint32_t TBuf<pos>::GetBufLen() const
{
    return bufLen;
}
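/*
 * Illustrative TBuf usage (names `pipe` and `calcBuf` and the 1024-byte length
 * are placeholders): a TBuf is a single reusable scratch region, which is why
 * the EnQue/DeQue/AllocTensor/FreeTensor overloads above are no-ops or plain
 * Get() calls and perform no allocation or synchronization.
 *
 *     TPipe pipe;
 *     TBuf<TPosition::VECCALC> calcBuf;
 *     pipe.InitBuffer(calcBuf, 1024);                    // one 1024-byte region
 *     LocalTensor<float> whole = calcBuf.Get<float>();   // all 256 floats
 *     LocalTensor<float> part = calcBuf.Get<float>(64);  // first 64 elements
 */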
// begin impl of tpipe
__aicore__ inline TPipe::TPipe()
{
    InitSocState();
    Init();
}

__aicore__ inline TPipe::~TPipe()
{
    if (g_tpipeImpl.isDestroy) {
        return;
    }
    Destroy();
}

__aicore__ inline void TPipe::Init()
{
    ResetPool();
    // for matmul macro, set flag M_MTE1 at the beginning of the operator, and also wait flag at the end.
    // matmul macro only uses M_MTE1 event ids 0 and 1 currently.
#if __CCE_AICORE__ == 220
    if ASCEND_IS_AIC {
        auto enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
        ASCENDC_ASSERT((enQueEvtID == 0), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 0"); });
        SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
        enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
        ASCENDC_ASSERT((enQueEvtID == 1), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 1"); });
        SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
        // For load Bias
        enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
        ASCENDC_ASSERT((enQueEvtID == 2), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 2"); });
        SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
    }
#elif __CCE_AICORE__ == 300
    auto enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
    ASCENDC_ASSERT((enQueEvtID == 0), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 0"); });
    SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
    enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
    ASCENDC_ASSERT((enQueEvtID == 1), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 1"); });
    SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
    // For load Bias
    enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
    ASCENDC_ASSERT((enQueEvtID == 2), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 2"); });
    SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
#endif
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    for (int32_t i = 0; i < static_cast<int32_t>(Hardware::MAX); i++) {
        SetBufferCtx((Hardware)i, &g_tpipeImpl.bufPoolBaseAddr_[i]);
    }
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    AscendCBufAbsAddr(uint8_t(Hardware::UB),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuUB)),
        bufferInitLen.at(Hardware::UB));
    AscendCBufAbsAddr(uint8_t(Hardware::L1),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL1)),
        bufferInitLen.at(Hardware::L1));
    AscendCBufAbsAddr(uint8_t(Hardware::L0A),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL0A)),
        bufferInitLen.at(Hardware::L0A));
    AscendCBufAbsAddr(uint8_t(Hardware::L0B),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL0B)),
        bufferInitLen.at(Hardware::L0B));
    AscendCBufAbsAddr(uint8_t(Hardware::L0C),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL0C)),
        bufferInitLen.at(Hardware::L0C));
    AscendCBufAbsAddr(uint8_t(Hardware::BIAS),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuBIAS)),
        bufferInitLen.at(Hardware::BIAS));
    AscendCBufAbsAddr(uint8_t(Hardware::FIXBUF),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuFIXBUF)),
        bufferInitLen.at(Hardware::FIXBUF));
#endif
#if __CCE_AICORE__ == 220
#ifdef __DAV_C220_CUBE__
    g_cubeTPipePtr = this;
#elif defined(__DAV_C220_VEC__)
    g_vecTPipePtr = this;
#else
    g_tPipePtr = this;
#endif
#else
    g_tPipePtr = this;
#endif
    g_tpipeImpl.isDestroy = false;
}
template <class T>
__aicore__ inline bool TPipe::InitBuffer(T& que, uint8_t num, uint32_t len)
{
    static_assert((T::isTQue), "TPipe::InitBuffer(T& que, uint8_t num, uint32_t len) does not support T as TBuf");
    ASCENDC_ASSERT((que.config.bufferNumber == 0 || que.config.bufferNumber == num), {
        KERNEL_LOG(KERNEL_ERROR, "buffer number is %u, which should be the same as TQueConfig::bufferNumber(%u)",
            num, que.config.bufferNumber);
    });
    ASCENDC_ASSERT((que.config.bufferLen == 0 || que.config.bufferLen == len), {
        KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be the same as TQueConfig::bufferLen(%u)",
            len, que.config.bufferLen);
    });
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    if constexpr (T::dstPosition == TPosition::TSCM) {
        return TscmInitBuffer(que, num, len);
    }
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    que.value = num;
    que.bufStart = this->g_tpipeImpl.buf_ + this->g_tpipeImpl.curBufSize_;
    DEBUG_CODE(que.bufLen = num * len);
    Hardware pool = GetBufferPos(T::srcPosition, T::dstPosition);
    ASCENDC_ASSERT((pool != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::GM"); });
    ASCENDC_ASSERT((pool != Hardware::MAX),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::MAX"); });
    auto curPoolAddr = this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
    auto ptr = que.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", num * len, bufferInitLen.at(pool));
    });
    auto pos_ = GetPosition(T::srcPosition, T::dstPosition);
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos_));
    AscendCBufInit(static_cast<uint8_t>(pos_), 0, num, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
#endif
    for (int32_t i = 0; i < num; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->freeBufEvt = T::freeBufEvt;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr = curPoolAddr;
    this->g_tpipeImpl.curBufSize_ += num;
    ASCENDC_ASSERT((this->g_tpipeImpl.curBufSize_ < QBUF_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, limit is %d", this->g_tpipeImpl.curBufSize_, QBUF_MAX_LEN);
    });
    ASCENDC_ASSERT(
        (this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr <= this->g_tpipeImpl.tscmBufferPtr_), {
            KERNEL_LOG(KERNEL_ERROR, "tscm addr is %d, limit is %d", this->g_tpipeImpl.tscmBufferPtr_,
                this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr);
        });
    return true;
}
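/*
 * Worked example of the length rounding used by the InitBuffer overloads,
 * assuming ONE_BLK_SIZE == 32 and MIN_BLOCK_LEN == 1:
 *     len = (len + 31) / 32 * 32
 * so a request of 100 bytes is padded to 128, while an already aligned request
 * such as 256 is unchanged. Every buffer slot therefore starts on a 32-byte
 * block boundary.
 */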
template <TPosition pos>
__aicore__ inline bool TPipe::InitBuffer(TBuf<pos>& buf, uint32_t len)
{
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr int32_t bufHandleSize = 1;
    buf.bufStart = this->g_tpipeImpl.buf_ + this->g_tpipeImpl.curBufSize_;
    buf.bufLen = len;
    buf.offset = 0;
    constexpr auto pool = GetPhyType(pos);
    ASCENDC_ASSERT((pool != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::GM"); });
    auto curPoolAddr = g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
    auto ptr = buf.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos));
    AscendCBufInit(static_cast<uint8_t>(pos), 1, 1, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
#endif
    for (uint8_t i = 0; i < bufHandleSize; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, which exceeds limit %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr = curPoolAddr;
    this->g_tpipeImpl.curBufSize_ += bufHandleSize;
    ASCENDC_ASSERT((this->g_tpipeImpl.curBufSize_ < QBUF_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "current total buffer num is %d, which exceeds limit %d",
            this->g_tpipeImpl.curBufSize_, QBUF_MAX_LEN);
    });
    return true;
}

template <class T>
__aicore__ inline bool TPipe::InitBufPool(T& bufPool, uint32_t len)
{
    static_assert((T::isTbufPool), "TPipe::InitBufPool(T& bufPool, uint32_t len) only supports T as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    bufPool.g_tBufPoolImpl.startAddr_ = this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = len;
    auto curPoolAddr = this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto pos = T::poolPos;
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos));
    AscendCTBufPoolInit(static_cast<uint8_t>(pos), reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    curPoolAddr += len;
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr = curPoolAddr;
    ASCENDC_ASSERT(
        (this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr <= this->g_tpipeImpl.tscmBufferPtr_), {
            KERNEL_LOG(KERNEL_ERROR, "tscm addr is %d, limit is %d", this->g_tpipeImpl.tscmBufferPtr_,
                this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr);
        });
    return true;
}

template <class T, class U>
__aicore__ inline bool TPipe::InitBufPool(T& bufPool, uint32_t len, U& shareBuf)
{
    static_assert((T::isTbufPool && U::isTbufPool),
        "TPipe::InitBufPool(T& bufPool, uint32_t len, U& shareBuf) only supports T and U as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    ASCENDC_ASSERT((pool == GetPhyType(U::poolPos)),
        { KERNEL_LOG(KERNEL_ERROR, "Hardware type of input bufPool should be the same as shareBuf"); });
    bufPool.g_tBufPoolImpl.startAddr_ = shareBuf.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = shareBuf.g_tBufPoolImpl.maxLen_;
    ASCENDC_ASSERT((len <= shareBuf.g_tBufPoolImpl.maxLen_), {
        KERNEL_LOG(KERNEL_ERROR, "Length of input bufPool should be no longer than length of shareBuf, which is %u",
            shareBuf.g_tBufPoolImpl.maxLen_);
    });
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto pos = T::poolPos;
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos));
    AscendCTBufPoolInit(static_cast<uint8_t>(pos),
        reinterpret_cast<uint64_t>(bufPool.g_tBufPoolImpl.startAddr_ + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    return true;
}
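/*
 * Illustrative pool usage (placeholder names; template arguments assume the
 * default TBufPool configuration): the shareBuf overload lets two pools alias
 * one physical range, so mutually exclusive stages can reuse the same memory.
 *
 *     TPipe pipe;
 *     TBufPool<TPosition::VECCALC> poolA;
 *     TBufPool<TPosition::VECCALC> poolB;
 *     pipe.InitBufPool(poolA, 8192);        // carve 8 KB out of the pipe
 *     pipe.InitBufPool(poolB, 8192, poolA); // poolB reuses poolA's range
 */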
template <HardEvent evt>
__aicore__ inline TEventID TPipe::AllocEventID()
{
    ASCENDC_ASSERT((evt < HardEvent::MAX),
        { KERNEL_LOG(KERNEL_ERROR, "illegal event %d", static_cast<int32_t>(evt)); });
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    auto lastId = sff0(ptr->eventOccupy);
    ASCENDC_ASSERT((lastId < QUE_MAX_EVENT && lastId >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %ld, max buffer number in same queue position is %d", lastId,
            QUE_MAX_EVENT);
    });
    ptr->eventOccupy = sbitset1(ptr->eventOccupy, lastId);
    return lastId;
}

template <HardEvent evt>
__aicore__ inline void TPipe::ReleaseEventID(TEventID id)
{
    ASCENDC_ASSERT((id >= 0 && id < QUE_MAX_EVENT), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %d, which should be larger than 0, and smaller than %d",
            static_cast<int32_t>(id), QUE_MAX_EVENT);
    });
    ASCENDC_ASSERT((evt != HardEvent::MAX), { KERNEL_LOG(KERNEL_ERROR, "evt cannot be HardEvent::MAX"); });
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    ptr->eventOccupy = sbitset0(ptr->eventOccupy, id);
    return;
}

__aicore__ inline TEventID TPipe::FetchEventID(HardEvent evt)
{
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    auto lastId = sff0(ptr->eventOccupy);
    ASCENDC_ASSERT((lastId < QUE_MAX_EVENT && lastId >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %ld, max buffer number in same queue position is %d", lastId,
            QUE_MAX_EVENT);
    });
    return lastId;
}

template <HardEvent evt>
__aicore__ inline TEventID TPipe::FetchEventID()
{
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    auto lastId = sff0(ptr->eventOccupy);
    ASCENDC_ASSERT((lastId < QUE_MAX_EVENT && lastId >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %ld, max buffer number in same queue position is %d", lastId,
            QUE_MAX_EVENT);
    });
    return lastId;
}
template <TPosition pos>
[[deprecated("NOTICE: GetAbsAddr has been deprecated and will be removed in the next version. "
             "Please do not use it!")]]
__aicore__ inline TBuffAddr TPipe::GetAbsAddr(int32_t offset, int32_t len) const
{
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = nullptr;
    addr.bufferAddr = offset;
    addr.dataLen = len;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    constexpr auto pool = GetPhyType(pos);
    ASCENDC_ASSERT((pool != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::GM"); });
    ASCENDC_ASSERT(((offset + len) <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "offset is %d, len is %d, which exceeds limit %d", offset, len,
            bufferInitLen.at(pool));
    });
    auto absAddr = this->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(pool)].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    return addr;
}

template <TPosition pos, typename T>
[[deprecated("NOTICE: GetAbsAddr has been deprecated and will be removed in the next version. "
             "Please do not use it!")]]
__aicore__ inline __sync_noalias__ LocalTensor<T> TPipe::GetAbsAddr(int32_t offset, int32_t size) const
{
    TBuffAddr addr = GetAbsAddr<pos>(offset, static_cast<int32_t>(size * sizeof(T)));
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

__aicore__ inline void TPipe::InitShareBufStart(uint32_t mode, uint32_t* shareLens, uint32_t lens,
    uint8_t subBlockIdx)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((lens == static_cast<uint32_t>(TShareBuf::ShareHard::MAX)), {
        KERNEL_LOG(KERNEL_ERROR, "lens is %d, which should be %d", lens,
            static_cast<int32_t>(TShareBuf::ShareHard::MAX));
    });
#else
    (void)(lens);
#endif
    ASCENDC_ASSERT((subBlockIdx == 0 || subBlockIdx == 1),
        { KERNEL_LOG(KERNEL_ERROR, "subBlockIdx is %d, which should only be 0/1", subBlockIdx); });
    AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L1), Hardware::L1, subBlockIdx);
    AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L0C), Hardware::L0C, subBlockIdx);
#if __CCE_AICORE__ < 220
    AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::UB), Hardware::UB, subBlockIdx);
#endif
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0A)].maxAddr = 0;
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0B)].maxAddr = 0;
    // v100 shouldn't use the bias table
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::BIAS)].maxAddr = 0;
    return;
}

__aicore__ inline void TPipe::InitShareBufEnd()
{
    // debug methods need to be added.
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr =
        g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L1)];
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0C)].maxAddr =
        g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L0C)];
#if __CCE_AICORE__ < 220
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::UB)].maxAddr =
        g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::UB)];
#endif
    return;
}

__aicore__ inline void InitShareBufStart(TPipe* tpipe, uint32_t mode, uint32_t* shareLens, uint32_t lens,
    uint8_t subBlockIdx)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((lens == static_cast<uint32_t>(TShareBuf::ShareHard::MAX)), {
        KERNEL_LOG(KERNEL_ERROR, "lens is %d, which should be %d", lens,
            static_cast<int32_t>(TShareBuf::ShareHard::MAX));
    });
#else
    (void)(lens);
#endif
    ASCENDC_ASSERT((subBlockIdx == 0 || subBlockIdx == 1),
        { KERNEL_LOG(KERNEL_ERROR, "subBlockIdx is %d, which should only be 0/1", subBlockIdx); });
    tpipe->AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L1), Hardware::L1,
        subBlockIdx);
    tpipe->AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L0C), Hardware::L0C,
        subBlockIdx);
#if __CCE_AICORE__ < 220
    tpipe->AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::UB), Hardware::UB,
        subBlockIdx);
#endif
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0A)].maxAddr = 0;
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0B)].maxAddr = 0;
    // v100 shouldn't use the bias table
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::BIAS)].maxAddr = 0;
    return;
}

__aicore__ inline void InitShareBufEnd(TPipe* tpipe)
{
    // debug methods need to be added.
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr =
        tpipe->g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L1)];
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0C)].maxAddr =
        tpipe->g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L0C)];
#if __CCE_AICORE__ < 220
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::UB)].maxAddr =
        tpipe->g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::UB)];
#endif
    return;
}
template <typename T>
__aicore__ inline void TPipe::InitSpmBuffer(const GlobalTensor<T>& workspace, const int32_t bufferSize)
{
    g_tpipeImpl.spmInfo_.spmBuffSize = bufferSize;
    g_tpipeImpl.spmInfo_.spmAddr = reinterpret_cast<uint64_t>(workspace.GetPhyAddr());
    g_tpipeImpl.spmInfo_.spmBufType = static_cast<uint8_t>(Hardware::GM);
}

__aicore__ inline void TPipe::InitSpmBuffer(const int32_t bufferSize)
{
#if __CCE_AICORE__ >= 220
    (void)(bufferSize);
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "only supports platforms ascend910 and ascend310p"); });
#else
    g_tpipeImpl.spmInfo_.spmBuffSize = bufferSize;
    TQueBind<TPosition::GM, TPosition::A1, 1> inQueue;
    constexpr auto pool = GetPhyType(QuePosition::A1);
    g_tpipeImpl.spmInfo_.spmAddr = g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
#ifdef __CCE_KT_TEST__
    auto absAddr = GetBaseAddr(static_cast<int8_t>(TPosition::A1));
    g_tpipeImpl.spmInfo_.spmAddr = g_tpipeImpl.spmInfo_.spmAddr + reinterpret_cast<uint64_t>(absAddr);
#endif
    InitBuffer(inQueue, 1, bufferSize);
    g_tpipeImpl.spmInfo_.spmBufType = static_cast<uint8_t>(Hardware::L1);
#endif
}
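/*
 * Illustrative spill-buffer usage (placeholder names): the Write/ReadSpmBuffer
 * implementations below spill a UB tile to the buffer prepared by
 * InitSpmBuffer (GM, or L1 on pre-220 chips) and insert all of the MTE/V
 * handshakes themselves, so the caller only provides tensors and offsets.
 *
 *     pipe.InitSpmBuffer(workspaceGm, SPM_SIZE);  // spill target in GM
 *     pipe.WriteSpmBuffer(tileUb, copyParams, 0); // UB -> spill storage
 *     ...                                         // reuse tileUb meanwhile
 *     pipe.ReadSpmBuffer(tileUb, copyParams, 0);  // spill storage -> UB
 */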
template <typename T>
__aicore__ inline void TPipe::WriteSpmBuffer(const LocalTensor<T>& writeLocal, const DataCopyParams& copyParams,
    int32_t writeOffset)
{
    /*
     * before the write, the local may come from MTE2/V, so we need MTE3 to wait for V/MTE2
     * after the write, the local may be used to compute or copy out, so V/MTE2 must wait for MTE3
     */
    event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
    SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        DataCopyUB2GMImpl(reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), copyParams);
        event_t eventIDMTE3ToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
        SetFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
        WaitFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((writeOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "writeOffset is %d, which must be 32B aligned", writeOffset); });
        DataCopyUB2L1Impl(reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), copyParams);
        event_t eventIDMTE3ToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE1));
        SetFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
        WaitFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
    }
    event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
    SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
    WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
}

template <typename T>
__aicore__ inline void TPipe::ReadSpmBuffer(const LocalTensor<T>& readLocal, const DataCopyParams& copyParams,
    int32_t readOffset)
{
    /*
     * before the read, the local may still be computed on, so MTE must wait for V
     * after the read, the local may be used to compute or copy out, so V/MTE3 must wait for the read
     */
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        event_t eventIDVToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2));
        event_t eventIDMTE2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
        SetFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        WaitFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        DataCopyGM2UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, copyParams);
        SetFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
        WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((readOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "readOffset is %d, which must be 32B aligned", readOffset); });
        event_t eventIDVToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE1));
        event_t eventIDMTE1ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_V));
        event_t eventIDMTE1ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE3));
        SetFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        WaitFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        DataCopyL12UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, copyParams);
        SetFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        WaitFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        SetFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
        WaitFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
    }
}

template <typename T>
__aicore__ inline void TPipe::WriteSpmBuffer(const LocalTensor<T>& writeLocal, const int32_t writeSize,
    int32_t writeOffset)
{
    /*
     * before the write, the local may come from MTE2/V, so we need MTE3 to wait for V/MTE2
     * after the write, the local may be used to compute or copy out, so V/MTE2 must wait for MTE3
     */
    int computeSize = writeSize != 0 ? writeSize : GetShapeSize(writeLocal.GetShapeInfo());
    struct DataCopyParams repeatParams;
    repeatParams.blockLen = computeSize / AscendCUtils::GetC0Count(sizeof(T));
    event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
    event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
    SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        DataCopyUB2GMImpl(reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), repeatParams);
        event_t eventIDMTE3ToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
        SetFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
        WaitFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((writeOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "writeOffset is %d, which must be 32B aligned", writeOffset); });
        ASCENDC_ASSERT((writeSize % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "writeSize is %d, which must be 32B aligned", writeSize); });
        DataCopyUB2L1Impl(reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), repeatParams);
        event_t eventIDMTE3ToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE1));
        SetFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
        WaitFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
    }
    SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
    WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
}

template <typename T>
__aicore__ inline void TPipe::ReadSpmBuffer(const LocalTensor<T>& readLocal, const int32_t readSize,
    int32_t readOffset)
{
    /*
     * before the read, the local may still be computed on, so MTE must wait for V
     * after the read, the local may be used to compute or copy out, so V/MTE3 must wait for the read
     */
    int computeSize = readSize != 0 ? readSize : GetShapeSize(readLocal.GetShapeInfo());
    struct DataCopyParams repeatParams;
    repeatParams.blockLen = computeSize / AscendCUtils::GetC0Count(sizeof(T));
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        event_t eventIDVToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2));
        event_t eventIDMTE2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
        SetFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        WaitFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        DataCopyGM2UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, repeatParams);
        SetFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
        WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((readOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "readOffset is %d, which must be 32B aligned", readOffset); });
        ASCENDC_ASSERT((readSize % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "readSize is %d, which must be 32B aligned", readSize); });
        event_t eventIDVToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE1));
        event_t eventIDMTE1ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_V));
        event_t eventIDMTE1ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE3));
        SetFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        WaitFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        DataCopyL12UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, repeatParams);
        SetFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        WaitFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        SetFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
        WaitFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
    }
}

template <TPosition pos>
__aicore__ inline uint64_t TPipe::GetQueueEndAddress()
{
    Hardware hardType = GetPhyType(pos);
    ASCENDC_ASSERT((hardType == Hardware::UB), { KERNEL_LOG(KERNEL_ERROR, "hardType should be UB"); });
    return this->g_tpipeImpl.bufPool_[static_cast<int32_t>(hardType)].maxAddr;
}

__aicore__ inline void TPipe::Destroy()
{
    g_tpipeImpl.isDestroy = true;
    auto ptr = this->g_tpipeImpl.buf_;
    for (uint8_t i = 0; i < this->g_tpipeImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    // for matmul macro, release M_MTE1 event ids 0 and 1.
#if __CCE_AICORE__ == 220
    if ASCEND_IS_AIC {
        WaitFlag<HardEvent::M_MTE1>(0);
        ReleaseEventID<HardEvent::M_MTE1>(0);
        WaitFlag<HardEvent::M_MTE1>(1);
        ReleaseEventID<HardEvent::M_MTE1>(1);
        // For Bias
        WaitFlag<HardEvent::M_MTE1>(2);
        ReleaseEventID<HardEvent::M_MTE1>(2);
    }
#elif __CCE_AICORE__ == 300
    WaitFlag<HardEvent::M_MTE1>(0);
    ReleaseEventID<HardEvent::M_MTE1>(0);
    WaitFlag<HardEvent::M_MTE1>(1);
    ReleaseEventID<HardEvent::M_MTE1>(1);
    WaitFlag<HardEvent::M_MTE1>(2);
    ReleaseEventID<HardEvent::M_MTE1>(2);
#endif
    pipe_barrier(PIPE_ALL);
#if __CCE_AICORE__ == 200
    dcci((__gm__ int64_t*)0, cache_line_t::ENTIRE_DATA_CACHE);
#endif
}

__aicore__ inline void TPipe::Reset()
{
    auto ptr = this->g_tpipeImpl.buf_;
    for (uint8_t i = 0; i < this->g_tpipeImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    InitSocState();
    ResetPool();
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    for (int32_t i = 0; i < static_cast<int32_t>(Hardware::MAX); i++) {
        SetBufferCtx((Hardware)i, &g_tpipeImpl.bufPoolBaseAddr_[i]);
    }
#endif
}

#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
template <typename T>
[[deprecated("NOTICE: GetAbsAddr has been deprecated and will be removed in the next version. "
             "Please do not use it!")]]
inline uint64_t TPipe::GetAbsAddr(const LocalTensor<T>& tensor)
{
    // Translates the CPU address to the actual physical address.
    // Currently, only L1 or UB address translation is supported.
    int8_t logicPos = tensor.GetPosition();
    auto positionHardMap = ConstDefiner::Instance().positionHardMap;
    ASCENDC_ASSERT((positionHardMap.find((TPosition)logicPos) != positionHardMap.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal logicPos %d ", static_cast<int32_t>(logicPos)); });
    Hardware hardType = positionHardMap.at((TPosition)logicPos);
    ASCENDC_ASSERT(((hardType == Hardware::UB) || (hardType == Hardware::L1)),
        { KERNEL_LOG(KERNEL_ERROR, "illegal hardType %d ", static_cast<int32_t>(hardType)); });
    uint8_t* phyAddr = reinterpret_cast<uint8_t*>(tensor.GetPhyAddr());
    uint8_t* baseAddr =
        static_cast<uint8_t*>(g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(hardType)].absAddr);
    ASCENDC_ASSERT((phyAddr >= baseAddr), {
        KERNEL_LOG(KERNEL_ERROR, "phyAddr is %p, baseAddr is %p, phyAddr should not be smaller than baseAddr",
            phyAddr, baseAddr);
    });
    uint64_t delta = phyAddr - baseAddr;
    if (hardType == Hardware::UB) {
        ASCENDC_ASSERT((delta < TMP_UB_OFFSET),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds ub limit %lu ", delta, TMP_UB_OFFSET); });
    } else {
        ASCENDC_ASSERT((delta < TOTAL_L1_SIZE),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds l1 limit %lu", delta, TOTAL_L1_SIZE); });
    }
    return delta;
}

template <typename T>
inline uint64_t GetAbsAddr(TPipe* tpipe, const LocalTensor<T>& tensor)
{
    // Translates the CPU address to the actual physical address.
    // Currently, only L1 or UB address translation is supported.
    int8_t logicPos = tensor.GetPosition();
    auto positionHardMap = ConstDefiner::Instance().positionHardMap;
    ASCENDC_ASSERT((positionHardMap.find((TPosition)logicPos) != positionHardMap.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal logicPos %d ", static_cast<int32_t>(logicPos)); });
    Hardware hardType = positionHardMap.at((TPosition)logicPos);
    ASCENDC_ASSERT(((hardType == Hardware::UB) || (hardType == Hardware::L1)),
        { KERNEL_LOG(KERNEL_ERROR, "illegal hardType %d ", static_cast<int32_t>(hardType)); });
    uint8_t* phyAddr = reinterpret_cast<uint8_t*>(tensor.GetPhyAddr());
    uint8_t* baseAddr =
        static_cast<uint8_t*>(tpipe->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(hardType)].absAddr);
    ASCENDC_ASSERT((phyAddr >= baseAddr), {
        KERNEL_LOG(KERNEL_ERROR, "phyAddr is %p, baseAddr is %p, phyAddr should not be smaller than baseAddr",
            phyAddr, baseAddr);
    });
    uint64_t delta = phyAddr - baseAddr;
    if (hardType == Hardware::UB) {
        ASCENDC_ASSERT((delta < TMP_UB_OFFSET),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds ub limit %lu ", delta, TMP_UB_OFFSET); });
    } else {
        ASCENDC_ASSERT((delta < TOTAL_L1_SIZE),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds l1 limit %lu", delta, TOTAL_L1_SIZE); });
    }
    return delta;
}

inline uint8_t* TPipe::GetBaseAddr(int8_t logicPos)
{
    auto positionHardMap = ConstDefiner::Instance().positionHardMap;
    ASCENDC_ASSERT((positionHardMap.find((TPosition)logicPos) != positionHardMap.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal logicPos %d ", int32_t(logicPos)); });
    Hardware hardType = positionHardMap.at((TPosition)logicPos);
    ASCENDC_ASSERT((hardType != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "hardware position can not be gm"); });
    uint8_t* baseAddr =
        static_cast<uint8_t*>(g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(hardType)].absAddr);
    return baseAddr;
}
void inline TPipe::SetBufferCtx(Hardware hard, struct BufPoolExtra* bufPool)
{
    ASCENDC_ASSERT((hard != Hardware::MAX),
        { KERNEL_LOG(KERNEL_ERROR, "hard type can not be Hardware::MAX"); });
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((bufferInitLen.find(hard) != bufferInitLen.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal hard type %d", static_cast<int32_t>(hard)); });
    uint8_t* ptr;
    if (hard == Hardware::GM) {
        ptr = ConstDefiner::Instance().cpuGM;
    } else {
        ptr = ConstDefiner::Instance().hardwareCpuBufferMap.at(hard);
    }
    {
        // init memory with random values
        std::default_random_engine e;
        int32_t* p = reinterpret_cast<int32_t*>(ptr);
        for (uint64_t i = 0; i < bufferInitLen.at(hard) / sizeof(int32_t); i++) {
            p[i] = e();
        }
    }
    bufPool->phySpace = bufferInitLen.at(hard);
    bufPool->absAddr = ptr;
    return;
}
#endif

__aicore__ inline void TPipe::AuxShareBufStart(uint32_t mode, uint32_t* shareLens, uint8_t pos, Hardware hard,
    uint8_t subBlockIdx)
{
    uint8_t hardU8 = static_cast<uint8_t>(hard);
    if (unlikely(g_tpipeImpl.shareBufPool_.start[pos] == -1)) {
        // The address has not been initialized. Record the maximum allocated address.
        g_tpipeImpl.shareBufPool_.start[pos] = this->g_tpipeImpl.bufPool_[hardU8].maxAddr;
        g_tpipeImpl.shareBufPool_.maxAddr[pos] = g_tpipeImpl.shareBufPool_.start[pos] + shareLens[pos];
        DEBUG_CODE(g_tpipeImpl.shareBufPool_.length[pos] = shareLens[pos]);
    } else {
        DEBUG_CODE(g_tpipeImpl.shareBufPool_.length[pos] =
            g_tpipeImpl.shareBufPool_.length[pos] > shareLens[pos] ? g_tpipeImpl.shareBufPool_.length[pos]
                                                                   : shareLens[pos]);
        // Record the maximum allocated address.
        g_tpipeImpl.shareBufPool_.maxAddr[pos] = this->g_tpipeImpl.bufPool_[hardU8].maxAddr;
        g_tpipeImpl.bufPool_[hardU8].maxAddr = g_tpipeImpl.shareBufPool_.start[pos]; // Reset resource start position.
    }
    if (mode == 1 && subBlockIdx == 1) {
        this->g_tpipeImpl.bufPool_[hardU8].maxAddr += shareLens[pos] / HALF_FACTOR; // Reset resource start position.
    }
    ASCENDC_ASSERT((g_tpipeImpl.shareBufPool_.length[pos] >= shareLens[pos]), {
        KERNEL_LOG(KERNEL_ERROR, "share buf len is %d, which exceeds recorded limit %d", shareLens[pos],
            g_tpipeImpl.shareBufPool_.length[pos]);
    });
}

__aicore__ inline void TPipe::InitSocState() const
{
    set_atomic_none();
#if __CCE_AICORE__ == 220
    if ASCEND_IS_AIC {
        set_mask_norm();
        set_l1_3d_size(static_cast<uint64_t>(0));
        set_padding(static_cast<uint64_t>(0));
    } else {
        set_vector_mask(static_cast<uint64_t>(-1), static_cast<uint64_t>(-1));
        set_mask_norm();
    }
#elif __CCE_AICORE__ == 300
    set_padding(static_cast<uint64_t>(0));
#endif
}

__aicore__ inline void TPipe::ResetPool()
{
    g_tpipeImpl.tscmBufferPtr_ = TOTAL_L1_SIZE;
    g_tpipeImpl.curBufSize_ = 0;
    auto buf = g_tpipeImpl.bufPool_;
    for (int32_t i = 0; i < static_cast<int32_t>(Hardware::MAX); i++, buf++) {
        buf->maxAddr = 0;
    }
    auto evt = g_tpipeImpl.eventPool_;
    for (int32_t i = 0; i < EVENT_NUM; i++, evt++) {
        evt->eventOccupy = 0;
    }
    g_tpipeImpl.shareBufPool_.start[static_cast<int32_t>(TShareBuf::ShareHard::L1)] = -1;
    g_tpipeImpl.shareBufPool_.start[static_cast<int32_t>(TShareBuf::ShareHard::UB)] = -1;
    g_tpipeImpl.shareBufPool_.start[static_cast<int32_t>(TShareBuf::ShareHard::L0C)] = -1;
}
template <class T>
__aicore__ inline bool TPipe::TscmInitBuffer(T& que, uint8_t num, uint32_t len)
{
    ASCENDC_ASSERT(((num * len) < TOTAL_L1_SIZE), {
        KERNEL_LOG(KERNEL_ERROR, "tscm buffer length is %u bytes, which is larger than total l1 size %u bytes",
            len * num, TOTAL_L1_SIZE);
    });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    que.value = num;
    que.bufStart = this->g_tpipeImpl.buf_ + this->g_tpipeImpl.curBufSize_;
    DEBUG_CODE(que.bufLen = num * len);
    // Assign l1
    constexpr Hardware pool = Hardware::L1;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "buffer length %d is too large, the limit is %d", num * len,
            bufferInitLen.at(pool));
    });
#endif
    uint32_t curPoolAddr;
    if constexpr (T::scmBlockGroup) {
        curPoolAddr = g_tpipeImpl.tscmBufferPtr_ - num * len;
        g_tpipeImpl.tscmBufferPtr_ -= num * len;
    } else {
        curPoolAddr = g_tpipeImpl.tscmBufferPtr_ - (GetTaskRationImpl() - GetSubBlockIdxImpl()) * len * num;
        g_tpipeImpl.tscmBufferPtr_ -= GetTaskRationImpl() * num * len;
    }
    auto ptr = que.bufStart;
    for (int32_t i = 0; i < num; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->freeBufEvt = T::freeBufEvt;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT(
        (this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr <= this->g_tpipeImpl.tscmBufferPtr_), {
            KERNEL_LOG(KERNEL_ERROR, "tscm addr %d overlapped with maxAddr %d", this->g_tpipeImpl.tscmBufferPtr_,
                this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr);
        });
    this->g_tpipeImpl.curBufSize_ += num;
    ASCENDC_ASSERT((this->g_tpipeImpl.curBufSize_ <= QBUF_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "max buffer num is %d, current buf size %d exceeds this limit", QBUF_MAX_LEN,
            this->g_tpipeImpl.curBufSize_);
    });
    return true;
}

// begin impl of tBufPool
template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline TBufPool<pos, bufIDSize>::TBufPool()
{
    Init();
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline TBufPool<pos, bufIDSize>::~TBufPool()
{
    auto ptr = this->g_tBufPoolImpl.buf_;
    for (uint8_t i = 0; i < this->g_tBufPoolImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    ResetPool();
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline void TBufPool<pos, bufIDSize>::ResetPool()
{
    g_tBufPoolImpl.curBufSize_ = 0;
    g_tBufPoolImpl.startAddr_ = 0;
    g_tBufPoolImpl.maxAddr_ = 0;
    g_tBufPoolImpl.maxLen_ = 0;
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline void TBufPool<pos, bufIDSize>::Init()
{
    constexpr auto pool = GetPhyType(pos);
    static_assert((pool == Hardware::L1 || pool == Hardware::UB),
        "TBufPool position should be one of A1/B1/C1/VECIN/VECOUT/VECCALC");
    ResetPool();
    g_tBufPoolImpl.isReset_ = true;
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline void TBufPool<pos, bufIDSize>::Reset()
{
    auto ptr = this->g_tBufPoolImpl.buf_;
    for (uint8_t i = 0; i < this->g_tBufPoolImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    ResetPool();
    g_tBufPoolImpl.isReset_ = true;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    AscendCUpdateTbufPoolStatus(reinterpret_cast<uint64_t>(&g_tBufPoolImpl), g_tBufPoolImpl.isReset_);
#endif
}
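/*
 * Illustrative nested-pool usage (placeholder names): queues carved from a
 * TBufPool by the InitBuffer overloads below behave like TPipe-initialized
 * queues, and Reset() drains pending free-buffer events before returning the
 * range for reuse by a later stage.
 *
 *     TBufPool<TPosition::VECCALC> pool;
 *     pipe.InitBufPool(pool, 16384);
 *     TQue<TPosition::VECIN, 2> que;
 *     pool.InitBuffer(que, 2, 4096); // 2 x 4 KB slots from the pool
 *     ...                            // stage 1 runs
 *     pool.Reset();                  // release the range for stage 2
 */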
template <TPosition pos, uint32_t bufIDSize>
template <class T>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBuffer(T& que, uint8_t num, uint32_t len)
{
    static_assert((T::isTQue),
        "TBufPool::InitBuffer(T& que, uint8_t num, uint32_t len) does not support T as TBuf");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    que.value = num;
    que.bufStart = this->g_tBufPoolImpl.buf_ + this->g_tBufPoolImpl.curBufSize_;
    DEBUG_CODE(que.bufLen = num * len);
    ASCENDC_ASSERT(
        (this->g_tBufPoolImpl.maxAddr_ + num * len <=
            this->g_tBufPoolImpl.startAddr_ + this->g_tBufPoolImpl.maxLen_), {
            KERNEL_LOG(KERNEL_ERROR, "Buffer init length exceeds the limit of the BufPool. Max length of the "
                "BufPool is %u", this->g_tBufPoolImpl.maxLen_);
        });
    auto curPoolAddr = this->g_tBufPoolImpl.maxAddr_;
    auto ptr = que.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    Hardware pool = GetBufferPos(T::srcPosition, T::dstPosition);
    ASCENDC_ASSERT((pool == GetPhyType(pos)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos should be the same as the pos of the TBufPool"); });
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", num * len, bufferInitLen.at(pool));
    });
    auto bufPos = GetPosition(T::srcPosition, T::dstPosition);
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCBufInit(static_cast<uint8_t>(bufPos), 0, num, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
    que.SetTBufPoolHandle(reinterpret_cast<uint64_t>(&g_tBufPoolImpl));
    ASCENDC_ASSERT((curPoolAddr + num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
#endif
    for (int32_t i = 0; i < num; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->freeBufEvt = T::freeBufEvt;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    this->g_tBufPoolImpl.maxAddr_ = curPoolAddr;
    this->g_tBufPoolImpl.curBufSize_ += num;
    ASCENDC_ASSERT((this->g_tBufPoolImpl.curBufSize_ <= QBUFPOOL_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, limit is %d", this->g_tBufPoolImpl.curBufSize_,
            QBUFPOOL_MAX_LEN);
    });
    return true;
}
template <TPosition pos, uint32_t bufIDSize>
template <TPosition bufPos>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBuffer(TBuf<bufPos>& buf, uint32_t len)
{
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr int32_t bufHandleSize = 1;
    buf.bufStart = this->g_tBufPoolImpl.buf_ + this->g_tBufPoolImpl.curBufSize_;
    buf.bufLen = len;
    buf.offset = 0;
    ASCENDC_ASSERT(
        (this->g_tBufPoolImpl.maxAddr_ + len <= this->g_tBufPoolImpl.startAddr_ + this->g_tBufPoolImpl.maxLen_), {
            KERNEL_LOG(KERNEL_ERROR, "Buffer init length exceeds the limit of the BufPool. Max length of the "
                "BufPool is %u", this->g_tBufPoolImpl.maxLen_);
        });
    constexpr auto pool = GetPhyType(bufPos);
    ASCENDC_ASSERT((GetPhyType(bufPos) == GetPhyType(pos)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos should be the same as the pos of the TBufPool"); });
    auto curPoolAddr = this->g_tBufPoolImpl.maxAddr_;
    auto ptr = buf.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCBufInit(static_cast<uint8_t>(bufPos), 1, 1, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
    buf.SetTBufPoolHandle(reinterpret_cast<uint64_t>(&g_tBufPoolImpl));
#endif
    for (uint8_t i = 0; i < bufHandleSize; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, which exceeds limit %d", curPoolAddr,
            bufferInitLen.at(pool));
    });
    this->g_tBufPoolImpl.maxAddr_ = curPoolAddr;
    this->g_tBufPoolImpl.curBufSize_ += bufHandleSize;
    ASCENDC_ASSERT((this->g_tBufPoolImpl.curBufSize_ <= QBUFPOOL_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "current total buffer num is %d, which exceeds limit %d",
            this->g_tBufPoolImpl.curBufSize_, QBUFPOOL_MAX_LEN);
    });
    return true;
}
template <TPosition pos, uint32_t bufIDSize>
template <class T>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBufPool(T& bufPool, uint32_t len)
{
    static_assert((T::isTbufPool),
        "TBufPool::InitBufPool(T& bufPool, uint32_t len) only supports T as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    bufPool.g_tBufPoolImpl.startAddr_ = this->g_tBufPoolImpl.maxAddr_;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = len;
    ASCENDC_ASSERT(
        (this->g_tBufPoolImpl.maxAddr_ + len <= this->g_tBufPoolImpl.startAddr_ + this->g_tBufPoolImpl.maxLen_), {
            KERNEL_LOG(KERNEL_ERROR, "Buffer init length exceeds the limit of the BufPool. Max length of the "
                "BufPool is %u", this->g_tBufPoolImpl.maxLen_);
        });
    auto curPoolAddr = this->g_tBufPoolImpl.maxAddr_;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto bufPos = T::poolPos;
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCTBufPoolInit(static_cast<uint8_t>(bufPos), reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
    AscendCRecordPoolHierarchy(reinterpret_cast<uint64_t>(&this->g_tBufPoolImpl),
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    curPoolAddr += len;
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tBufPoolImpl.maxAddr_ = curPoolAddr;
    return true;
}

template <TPosition pos, uint32_t bufIDSize>
template <class T, class U>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBufPool(T& bufPool, uint32_t len, U& shareBuf)
{
    static_assert((T::isTbufPool && U::isTbufPool),
        "TBufPool::InitBufPool(T& bufPool, uint32_t len, U& shareBuf) only supports T and U as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    constexpr auto sharedPool = GetPhyType(U::poolPos);
    ASCENDC_ASSERT((pool == sharedPool),
        { KERNEL_LOG(KERNEL_ERROR, "Position of input bufPool should be the same as position of shareBuf"); });
    bufPool.g_tBufPoolImpl.startAddr_ = shareBuf.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = shareBuf.g_tBufPoolImpl.maxLen_;
    ASCENDC_ASSERT((len <= shareBuf.g_tBufPoolImpl.maxLen_), {
        KERNEL_LOG(KERNEL_ERROR, "Length of input bufPool should be no longer than length of shareBuf, which is %u",
            shareBuf.g_tBufPoolImpl.maxLen_);
    });
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto bufPos = T::poolPos;
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCTBufPoolInit(static_cast<uint8_t>(bufPos),
        reinterpret_cast<uint64_t>(bufPool.g_tBufPoolImpl.startAddr_ + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
    AscendCRecordPoolHierarchy(reinterpret_cast<uint64_t>(&this->g_tBufPoolImpl),
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    return true;
}
} // namespace AscendC
#endif // ASCENDC_MODULE_TPIPE_INTERFACE_IMPL_H