/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*!
 * \file kernel_tpipe_interface_impl.h
 * \brief
 */
#ifndef ASCENDC_MODULE_TPIPE_INTERFACE_IMPL_H
#define ASCENDC_MODULE_TPIPE_INTERFACE_IMPL_H
#include "kernel_tpipe.h"

namespace AscendC {
template <typename T>
__aicore__ inline __in_pipe__(V) void NopInPipeV(const T& tensor)
{
    (void)(0);
}

template <typename T>
__aicore__ inline __out_pipe__(V) void NopOutPipeV(const T& tensor)
{
    (void)(0);
}

// begin impl of IsAivTscm used by tquebind
__aicore__ inline constexpr bool IsAivTscm(TPosition src, TPosition dst)
{
#if __CCE_AICORE__ == 220
    if (GetPosition(src, dst) == TPosition::TSCM) {
        return true;
    }
#else
    (void)(src);
    (void)(dst);
#endif
    return false;
}

// begin impl of tquebind
// TQueBind : this is used for off-standard queue
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TQueBind<src, dst, depth, mask>::TQueBind()
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    AscendCQueCreate(static_cast<uint8_t>(src), static_cast<uint8_t>(dst), depth);
#endif // __CCE_KT_TEST__
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TQueBind<src, dst, depth, mask>::AllocTensor()
{
    auto buf = AllocBuffer();
    return Buf2Tensor<T>(buf);
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline void TQueBind<src, dst, depth, mask>::FreeTensor(LocalTensor<T>& tensor)
{
    FreeBuffer(tensor.GetBufferHandle());
    return;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::EnQue(const LocalTensor<T>& tensor)
{
    if constexpr (GetPhyType(src) == Hardware::UB || GetPhyType(dst) == Hardware::UB) {
        NopInPipeV<LocalTensor<T>>(tensor);
    }
    auto buf = tensor.GetBufferHandle();
    return EnQue(reinterpret_cast<TBufHandle>(buf));
}
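/*
 * Illustrative usage sketch of the queue API implemented below. This is not
 * part of the header itself; names such as `pipe`, `inQue`, `gmSrc`, `Compute`
 * and `TILE_LEN` are placeholders, and the template arguments assume a plain
 * VECIN copy-in queue with double buffering.
 *
 *     TPipe pipe;
 *     TQue<TPosition::VECIN, 2> inQue;
 *     pipe.InitBuffer(inQue, 2, TILE_LEN * sizeof(half));
 *     for (int32_t i = 0; i < tileNum; ++i) {
 *         LocalTensor<half> t = inQue.AllocTensor<half>(); // FREE -> OCCUPIED
 *         DataCopy(t, gmSrc[i * TILE_LEN], TILE_LEN);      // MTE2 fills the tile
 *         inQue.EnQue(t);                                  // posts the enqueue event
 *         LocalTensor<half> w = inQue.DeQue<half>();       // waits for that event
 *         Compute(w);                                      // placeholder consumer
 *         inQue.FreeTensor(w);                             // back to FREE
 *     }
 */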
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::EnQue(TBufHandle buf)
{
    ASCENDC_ASSERT((this->usedCount < depth), {
        KERNEL_LOG(KERNEL_ERROR, "usedCount is %d, which exceeds depth limit %d",
            static_cast<int32_t>(usedCount), depth);
    });
    auto ptr = reinterpret_cast<TBufType*>(buf);
    if constexpr (depth == 1) {
        this->que_ = buf;
    } else {
        this->que_[this->tail] = buf;
    }
    this->usedCount++;
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    ASCENDC_ASSERT((ptr->state == TBufState::OCCUPIED) || (ptr->state == TBufState::DEQUE), {
        KERNEL_LOG(KERNEL_ERROR, "ptr state is %d, which should be OCCUPIED / DEQUE",
            static_cast<int32_t>(ptr->state));
    });
    DEBUG_CODE(ptr->state = TBufState::ENQUE);
    /* Add for TSCM
     * for 220, the aiv just sends a message, no need to add this set/wait */
#if __CCE_AICORE__ == 220
    // If the AIC is not entered, the AIV does not process any event ID.
    if (g_coreType != AIV || (GetPosition(src, dst) != TPosition::TSCM)) {
        auto enQueEvtID = GetTPipePtr()->AllocEventID<enQueEvt>();
        SetFlag<enQueEvt>(enQueEvtID);
        ptr->enQueEvtID = enQueEvtID;
    }
#else
    auto enQueEvtID = GetTPipePtr()->AllocEventID<enQueEvt>();
    SetFlag<enQueEvt>(enQueEvtID);
    ptr->enQueEvtID = enQueEvtID;
#endif
    if constexpr (depth != 1) {
        if (++this->tail >= depth) {
            this->tail = 0;
        }
    }
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufEnque(static_cast<uint8_t>(src), static_cast<uint8_t>(dst),
        static_cast<uint8_t>(GetPosition(src, dst)), reinterpret_cast<uint64_t>(absAddr + ptr->address));
#endif // __CCE_KT_TEST__
    return true;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline LocalTensor<T> TQueBind<src, dst, depth, mask>::DeQue()
{
    auto buf = DeQue();
    auto ret = Buf2Tensor<T>(buf);
    if constexpr (GetPhyType(src) == Hardware::UB || GetPhyType(dst) == Hardware::UB) {
        NopOutPipeV<LocalTensor<T>>(ret);
    }
    return ret;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBufHandle TQueBind<src, dst, depth, mask>::DeQue()
{
    TBufHandle buf;
    if constexpr (depth == 1) {
        buf = this->que_;
    } else {
        buf = this->que_[this->head];
    }
    ASCENDC_ASSERT((buf != nullptr), { KERNEL_LOG(KERNEL_ERROR, "buf can not be nullptr"); });
    auto ptr = reinterpret_cast<TBufType*>(buf);
    ASCENDC_ASSERT((ptr->state == TBufState::ENQUE), {
        KERNEL_LOG(KERNEL_ERROR, "ptr state is %d, which can only be ENQUE", static_cast<int32_t>(ptr->state));
    });
    ASCENDC_ASSERT((this->usedCount > 0), {
        KERNEL_LOG(KERNEL_ERROR, "usedCount is %d, which must be larger than 0",
            static_cast<int32_t>(this->usedCount));
    });
    this->usedCount--;
    /* Add for TSCM
     * for 220, the aiv just sends a message, no need to add this set/wait */
    DEBUG_CODE(ptr->state = TBufState::DEQUE);
#if __CCE_AICORE__ == 220
    if (g_coreType != AIV || (GetPosition(src, dst) != TPosition::TSCM)) {
        if (ptr->enQueEvtID != INVALID_TEVENTID) {
            WaitFlag<enQueEvt>(ptr->enQueEvtID);
            GetTPipePtr()->ReleaseEventID<enQueEvt>(ptr->enQueEvtID);
            ptr->enQueEvtID = INVALID_TEVENTID;
        }
    }
#else
    if (ptr->enQueEvtID != INVALID_TEVENTID) {
        WaitFlag<enQueEvt>(ptr->enQueEvtID);
        GetTPipePtr()->ReleaseEventID<enQueEvt>(ptr->enQueEvtID);
        ptr->enQueEvtID = INVALID_TEVENTID;
    }
#endif
    if constexpr (depth != 1) {
        if (++this->head >= depth) {
            this->head = 0;
        }
    }
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufDeque(static_cast<uint8_t>(src), static_cast<uint8_t>(dst),
        static_cast<uint8_t>(GetPosition(src, dst)), (uint64_t)(absAddr + ptr->address));
#endif // __CCE_KT_TEST__
    return reinterpret_cast<TBufHandle>(buf);
}
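/*
 * Synchronization sketch for the EnQue/DeQue pair above, assuming a VECIN
 * copy-in queue whose enQueEvt is HardEvent::MTE2_V (a simplified view; TSCM
 * queues on 220 skip the set/wait as noted in the code):
 *
 *     EnQue(buf): id = AllocEventID<MTE2_V>(); SetFlag<MTE2_V>(id);   // producer side
 *     DeQue():    WaitFlag<MTE2_V>(id); ReleaseEventID<MTE2_V>(id);   // consumer side
 *
 * The queue itself only moves buffer handles; the Set/Wait pair is what orders
 * the producing pipe against the consuming pipe for each buffer.
 */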
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline void TQueBind<src, dst, depth, mask>::FreeBuffer(TBufHandle buf)
{
    auto ptr = reinterpret_cast<TBufType*>(buf);
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    ASCENDC_ASSERT((ptr->state != TBufState::FREE), {
        KERNEL_LOG(KERNEL_ERROR, "ptr state is %d, which can not be FREE", static_cast<int32_t>(ptr->state));
    });
    if constexpr (!IsAivTscm(src, dst)) {
        if constexpr (src == TPosition::C1 || (src == TPosition::CO2 && dst == TPosition::VECIN)) {
            SetFlag<freeBufEvt>(0); // insert pipe_v without eventID
            ASCENDC_ASSERT((ptr->freeBufEvtID == INVALID_TEVENTID),
                { KERNEL_LOG(KERNEL_ERROR, "freebuf event id should be -1 here"); });
        } else {
            ptr->freeBufEvtID = GetTPipePtr()->AllocEventID<freeBufEvt>();
            SetFlag<freeBufEvt>(ptr->freeBufEvtID);
        }
    }
    ptr->state = TBufState::FREE;
    this->bufUsedCount--;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufFree(static_cast<uint8_t>(bufferType), static_cast<uint8_t>(GetPosition(src, dst)),
        (uint64_t)(absAddr + ptr->address), static_cast<uint64_t>(ptr->dataLen));
#endif // __CCE_KT_TEST__
    return;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline void TQueBind<src, dst, depth, mask>::SetTBufPoolHandle(uint64_t bufPoolHandle)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    this->bufPoolHandle = bufPoolHandle;
#else
    (void)(bufPoolHandle);
#endif
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBufHandle TQueBind<src, dst, depth, mask>::AllocBuffer()
{
    DEBUG_CODE(int32_t size = 0);
    ASCENDC_ASSERT((bufNum > 0), {
        KERNEL_LOG(KERNEL_ERROR, "bufNum is %d, which must be larger than 0", static_cast<int32_t>(bufNum));
    });
    TBufType* ret;
    do {
        ret = this->bufStart + this->bufCursor;
        if constexpr (config.bufferNumber != 1) {
            this->bufCursor += 1;
            if (this->bufCursor == this->bufNum) {
                this->bufCursor = 0;
            }
        }
        if (ret->state == TBufState::FREE) {
            ret->state = TBufState::OCCUPIED;
            if constexpr (IsAivTscm(src, dst)) {
                break;
            }
            if constexpr (src == TPosition::C1) {
                if (ret->freeBufEvtID != INVALID_TEVENTID) {
                    WaitFlag<freeBufEvt>(ret->freeBufEvtID);
                    GetTPipePtr()->ReleaseEventID<freeBufEvt>(ret->freeBufEvtID);
                    ret->freeBufEvtID = INVALID_TEVENTID;
                }
            } else {
                if (ret->freeBufEvtID != INVALID_TEVENTID) {
                    WaitFlag<freeBufEvt>(ret->freeBufEvtID);
                    GetTPipePtr()->ReleaseEventID<freeBufEvt>(ret->freeBufEvtID);
                    ret->freeBufEvtID = INVALID_TEVENTID;
                }
            }
            break;
        }
        ASCENDC_ASSERT((++size <= this->bufNum), {
            KERNEL_LOG(KERNEL_ERROR, "size is %d, which exceeds limit %d", size,
                static_cast<int32_t>(this->bufNum));
        });
    } while (true);
    this->bufUsedCount++;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    AscendCBufAlloc(static_cast<uint8_t>(bufferType), static_cast<uint8_t>(GetPosition(src, dst)),
        reinterpret_cast<uint64_t>(absAddr + ret->address), static_cast<uint64_t>(ret->dataLen));
    if (this->bufPoolHandle != 0U) {
        AscendCUpdateTbufPoolStatus(this->bufPoolHandle, false);
        AscendCTBufPoolResetCheck(static_cast<uint8_t>(GetPosition(srcPosition, dstPosition)),
            reinterpret_cast<uint64_t>(absAddr + ret->address), static_cast<uint64_t>(ret->dataLen),
            this->bufPoolHandle);
    }
#endif // __CCE_KT_TEST__
    return reinterpret_cast<TBufHandle>(ret);
}
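/*
 * Buffer lifecycle driven by AllocBuffer/EnQue/DeQue/FreeBuffer (the ENQUE and
 * DEQUE transitions are tracked via DEBUG_CODE, i.e. in debug builds only):
 *
 *     FREE -> OCCUPIED -> ENQUE -> DEQUE -> FREE
 *        AllocBuffer     EnQue    DeQue    FreeBuffer
 *
 * FreeBuffer posts freeBufEvt, and the next AllocBuffer on the same slot waits
 * for it, so a buffer is never reused before its consumer pipe has finished.
 */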
template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::VacantInQue()
{
    return usedCount < depth;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::HasTensorInQue()
{
    return usedCount;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline int32_t TQueBind<src, dst, depth, mask>::GetTensorCountInQue()
{
    return usedCount;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline bool TQueBind<src, dst, depth, mask>::HasIdleBuffer()
{
    return bufUsedCount < bufNum;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline void TQueBind<src, dst, depth, mask>::FreeAllEvent()
{
    auto ptr = this->bufStart;
    for (int i = 0; i < this->bufNum; i++, ptr++) {
        // should be in deque status
        ASCENDC_ASSERT((ptr->enQueEvtID == INVALID_TEVENTID),
            { KERNEL_LOG(KERNEL_ERROR, "enque event id should be -1 here"); });
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlag<freeBufEvt>(ptr->freeBufEvtID);
            GetTPipePtr()->ReleaseEventID<freeBufEvt>(ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBuffAddr TQueBind<src, dst, depth, mask>::GetBufferAddr(TBufHandle buf)
{
    ASCENDC_ASSERT((GetPosition(src, dst) != TPosition::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be GM"); });
    auto ptr = reinterpret_cast<TBufType*>(buf);
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(GetPosition(src, dst));
    addr.bufferHandle = buf;
    addr.bufferAddr = ptr->address;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    constexpr Hardware bufferType = GetBufferPos(src, dst);
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(bufferType)].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    return addr;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline TBufState TQueBind<src, dst, depth, mask>::GetState(const LocalTensor<T>& tensor) const
{
    return GetState(tensor.GetBufferHandle());
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TQueBind<src, dst, depth, mask>::Buf2Tensor(TBufHandle buf)
{
    TBuffAddr addr = GetBufferAddr(buf);
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

template <TPosition src, TPosition dst, int32_t depth, auto mask>
__aicore__ inline TBufState TQueBind<src, dst, depth, mask>::GetState(const TBufHandle& handle) const
{
    if (handle == nullptr) {
        return TBufState::FREE;
    }
    auto ptr = reinterpret_cast<TBufType*>(handle);
    ASCENDC_ASSERT((this->bufStart <= ptr && ptr < this->bufStart + this->bufNum), {
        KERNEL_LOG(KERNEL_ERROR, "ptr is %p, which should be in range [%p, %p]", ptr, this->bufStart,
            this->bufStart + this->bufNum);
    });
    return ptr->state;
}
// begin impl of tbuf
template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::Get(uint32_t len)
{
    uint32_t dataLen;
    if constexpr (IsSameType<T, int4b_t>::value) {
        dataLen = len / INT4_TWO;
    } else {
        dataLen = len * sizeof(T);
    }
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    ASCENDC_ASSERT((dataLen % 32 == 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be a multiple of 32 bytes", len); });
    ASCENDC_ASSERT((dataLen <= bufLen),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, max buffer len is %u", dataLen, bufLen); });
#endif
    auto ptr = this->bufStart;
    ptr->dataLen = dataLen;
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = reinterpret_cast<TBufHandle>(ptr);
    addr.bufferAddr = ptr->address;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(GetPhyType(pos))].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
    AscendCBufGet(addr.logicPos, static_cast<uint8_t>(GetPhyType(pos)),
        reinterpret_cast<uint64_t>(addr.absAddr), len);
    if (this->bufPoolHandle != 0U) {
        AscendCUpdateTbufPoolStatus(this->bufPoolHandle, false);
        AscendCTBufPoolResetCheck(static_cast<uint8_t>(GetPhyType(pos)),
            reinterpret_cast<uint64_t>(absAddr + ptr->address), static_cast<uint64_t>(ptr->dataLen),
            this->bufPoolHandle);
    }
#endif
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::Get()
{
    if constexpr (IsSameType<T, int4b_t>::value) {
        return Get<T>(bufLen * INT4_TWO);
    } else {
        return Get<T>(bufLen / sizeof(T));
    }
}

template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::GetWithOffset(uint32_t size, uint32_t bufOffset)
{
    auto ptr = this->bufStart;
    ptr->dataLen = size * sizeof(T);
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = reinterpret_cast<TBufHandle>(ptr);
    addr.bufferAddr = ptr->address + bufOffset;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(pos));
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

template <TPosition pos>
__aicore__ inline void TBuf<pos>::SetTpipeBuf(TBufType* bufStartIn, uint32_t bufLenIn)
{
    this->bufStart = bufStartIn;
    this->bufLen = bufLenIn;
    this->offset = 0;
}

template <TPosition pos>
template <typename T>
__aicore__ inline void TBuf<pos>::EnQue(const LocalTensor<T>& tensor)
{
    (void)(0);
}

template <TPosition pos>
template <typename T>
__aicore__ inline LocalTensor<T> TBuf<pos>::DeQue()
{
    return Get<T>();
}

template <TPosition pos>
template <typename T>
__aicore__ inline __sync_noalias__ LocalTensor<T> TBuf<pos>::AllocTensor()
{
    return Get<T>();
}

template <TPosition pos>
template <typename T>
__aicore__ inline void TBuf<pos>::FreeTensor(LocalTensor<T>& tensor)
{
    (void)(0);
}

template <TPosition pos>
template <typename T>
__aicore__ inline TBufState TBuf<pos>::GetState(const LocalTensor<T>& tensor) const
{
    TBufHandle handle = tensor.GetBufferHandle();
    if (handle == nullptr) {
        return TBufState::FREE;
    }
    auto ptr = reinterpret_cast<TBufType*>(handle);
    return ptr->state;
}

template <TPosition pos>
__aicore__ inline bool TBuf<pos>::EnQue(TBufHandle buf)
{
    return true;
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::DeQue()
{
    return Get();
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::AllocBuffer()
{
    return Get();
}

template <TPosition pos>
__aicore__ inline void TBuf<pos>::FreeBuffer(TBufHandle buf)
{
    (void)(0);
}

template <TPosition pos>
__aicore__ inline TBuffAddr TBuf<pos>::GetBufferAddr(TBufHandle buf)
{
    auto ptr = reinterpret_cast<TBufType*>(buf);
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = buf;
    addr.bufferAddr = ptr->address;
    addr.dataLen = ptr->dataLen;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto absAddr = GetTPipePtr()->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(GetPhyType(pos))].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    return addr;
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::Get(uint32_t len)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((len <= bufLen),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, max buffer len is %u", len, bufLen); });
#endif
    this->bufStart->dataLen = len;
    return reinterpret_cast<TBufHandle>(this->bufStart);
}

template <TPosition pos>
__aicore__ inline TBufHandle TBuf<pos>::Get()
{
    return Get(bufLen);
}

template <TPosition pos>
__aicore__ inline uint32_t TBuf<pos>::GetBufLen() const
{
    return bufLen;
}
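/*
 * Illustrative TBuf usage (names `pipe` and `calcBuf` and the 1024-byte length
 * are placeholders): a TBuf is a single reusable scratch region, which is why
 * the EnQue/DeQue/AllocTensor/FreeTensor overloads above are no-ops or plain
 * Get() calls and perform no allocation or synchronization.
 *
 *     TPipe pipe;
 *     TBuf<TPosition::VECCALC> calcBuf;
 *     pipe.InitBuffer(calcBuf, 1024);                    // one 1024-byte region
 *     LocalTensor<float> whole = calcBuf.Get<float>();   // all 256 floats
 *     LocalTensor<float> part = calcBuf.Get<float>(64);  // first 64 elements
 */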
// begin impl of tpipe
__aicore__ inline TPipe::TPipe()
{
    InitSocState();
    Init();
}

__aicore__ inline TPipe::~TPipe()
{
    if (g_tpipeImpl.isDestroy) {
        return;
    }
    Destroy();
}

__aicore__ inline void TPipe::Init()
{
    ResetPool();
    // for matmul macro, set flag M_MTE1 at the beginning of the operator, and also wait flag at the end.
    // matmul macro only uses M_MTE1 event ids 0 and 1 currently.
#if __CCE_AICORE__ == 220
    if ASCEND_IS_AIC {
        auto enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
        ASCENDC_ASSERT((enQueEvtID == 0), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 0"); });
        SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
        enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
        ASCENDC_ASSERT((enQueEvtID == 1), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 1"); });
        SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
        // For load Bias
        enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
        ASCENDC_ASSERT((enQueEvtID == 2), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 2"); });
        SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
    }
#elif __CCE_AICORE__ == 300
    auto enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
    ASCENDC_ASSERT((enQueEvtID == 0), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 0"); });
    SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
    enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
    ASCENDC_ASSERT((enQueEvtID == 1), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 1"); });
    SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
    // For load Bias
    enQueEvtID = this->AllocEventID<HardEvent::M_MTE1>();
    ASCENDC_ASSERT((enQueEvtID == 2), { KERNEL_LOG(KERNEL_ERROR, "enQueEvtID should be 2"); });
    SetFlag<HardEvent::M_MTE1>(static_cast<event_t>(enQueEvtID));
#endif
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    for (int32_t i = 0; i < static_cast<int32_t>(Hardware::MAX); i++) {
        SetBufferCtx((Hardware)i, &g_tpipeImpl.bufPoolBaseAddr_[i]);
    }
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    AscendCBufAbsAddr(uint8_t(Hardware::UB),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuUB)),
        bufferInitLen.at(Hardware::UB));
    AscendCBufAbsAddr(uint8_t(Hardware::L1),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL1)),
        bufferInitLen.at(Hardware::L1));
    AscendCBufAbsAddr(uint8_t(Hardware::L0A),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL0A)),
        bufferInitLen.at(Hardware::L0A));
    AscendCBufAbsAddr(uint8_t(Hardware::L0B),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL0B)),
        bufferInitLen.at(Hardware::L0B));
    AscendCBufAbsAddr(uint8_t(Hardware::L0C),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuL0C)),
        bufferInitLen.at(Hardware::L0C));
    AscendCBufAbsAddr(uint8_t(Hardware::BIAS),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuBIAS)),
        bufferInitLen.at(Hardware::BIAS));
    AscendCBufAbsAddr(uint8_t(Hardware::FIXBUF),
        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ConstDefiner::Instance().cpuFIXBUF)),
        bufferInitLen.at(Hardware::FIXBUF));
#endif
#if __CCE_AICORE__ == 220
#ifdef __DAV_C220_CUBE__
    g_cubeTPipePtr = this;
#elif defined(__DAV_C220_VEC__)
    g_vecTPipePtr = this;
#else
    g_tPipePtr = this;
#endif
#else
    g_tPipePtr = this;
#endif
    g_tpipeImpl.isDestroy = false;
}
template <class T>
__aicore__ inline bool TPipe::InitBuffer(T& que, uint8_t num, uint32_t len)
{
    static_assert((T::isTQue), "TPipe::InitBuffer(T& que, uint8_t num, uint32_t len) does not support T as TBuf");
    ASCENDC_ASSERT((que.config.bufferNumber == 0 || que.config.bufferNumber == num), {
        KERNEL_LOG(KERNEL_ERROR, "buffer number is %u, which should be the same as TQueConfig::bufferNumber(%u)",
            num, que.config.bufferNumber);
    });
    ASCENDC_ASSERT((que.config.bufferLen == 0 || que.config.bufferLen == len), {
        KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be the same as TQueConfig::bufferLen(%u)",
            len, que.config.bufferLen);
    });
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    if constexpr (T::dstPosition == TPosition::TSCM) {
        return TscmInitBuffer(que, num, len);
    }
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    que.value = num;
    que.bufStart = this->g_tpipeImpl.buf_ + this->g_tpipeImpl.curBufSize_;
    DEBUG_CODE(que.bufLen = num * len);
    Hardware pool = GetBufferPos(T::srcPosition, T::dstPosition);
    ASCENDC_ASSERT((pool != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::GM"); });
    ASCENDC_ASSERT((pool != Hardware::MAX),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::MAX"); });
    auto curPoolAddr = this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
    auto ptr = que.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", num * len, bufferInitLen.at(pool));
    });
    auto pos_ = GetPosition(T::srcPosition, T::dstPosition);
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos_));
    AscendCBufInit(static_cast<uint8_t>(pos_), 0, num, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
#endif
    for (int32_t i = 0; i < num; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->freeBufEvt = T::freeBufEvt;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr = curPoolAddr;
    this->g_tpipeImpl.curBufSize_ += num;
    ASCENDC_ASSERT((this->g_tpipeImpl.curBufSize_ < QBUF_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, limit is %d", this->g_tpipeImpl.curBufSize_, QBUF_MAX_LEN);
    });
    ASCENDC_ASSERT(
        (this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr <= this->g_tpipeImpl.tscmBufferPtr_), {
            KERNEL_LOG(KERNEL_ERROR, "tscm addr is %d, limit is %d", this->g_tpipeImpl.tscmBufferPtr_,
                this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr);
        });
    return true;
}
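/*
 * Worked example of the length rounding used by the InitBuffer overloads,
 * assuming ONE_BLK_SIZE == 32 and MIN_BLOCK_LEN == 1:
 *     len = (len + 31) / 32 * 32
 * so a request of 100 bytes is padded to 128, while an already aligned request
 * such as 256 is unchanged. Every buffer slot therefore starts on a 32-byte
 * block boundary.
 */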
template <TPosition pos>
__aicore__ inline bool TPipe::InitBuffer(TBuf<pos>& buf, uint32_t len)
{
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr int32_t bufHandleSize = 1;
    buf.bufStart = this->g_tpipeImpl.buf_ + this->g_tpipeImpl.curBufSize_;
    buf.bufLen = len;
    buf.offset = 0;
    constexpr auto pool = GetPhyType(pos);
    ASCENDC_ASSERT((pool != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::GM"); });
    auto curPoolAddr = g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
    auto ptr = buf.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos));
    AscendCBufInit(static_cast<uint8_t>(pos), 1, 1, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
#endif
    for (uint8_t i = 0; i < bufHandleSize; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, which exceeds limit %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr = curPoolAddr;
    this->g_tpipeImpl.curBufSize_ += bufHandleSize;
    ASCENDC_ASSERT((this->g_tpipeImpl.curBufSize_ < QBUF_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "current total buffer num is %d, which exceeds limit %d",
            this->g_tpipeImpl.curBufSize_, QBUF_MAX_LEN);
    });
    return true;
}

template <class T>
__aicore__ inline bool TPipe::InitBufPool(T& bufPool, uint32_t len)
{
    static_assert((T::isTbufPool), "TPipe::InitBufPool(T& bufPool, uint32_t len) only supports T as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    bufPool.g_tBufPoolImpl.startAddr_ = this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = len;
    auto curPoolAddr = this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto pos = T::poolPos;
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos));
    AscendCTBufPoolInit(static_cast<uint8_t>(pos), reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    curPoolAddr += len;
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr = curPoolAddr;
    ASCENDC_ASSERT(
        (this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr <= this->g_tpipeImpl.tscmBufferPtr_), {
            KERNEL_LOG(KERNEL_ERROR, "tscm addr is %d, limit is %d", this->g_tpipeImpl.tscmBufferPtr_,
                this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr);
        });
    return true;
}

template <class T, class U>
__aicore__ inline bool TPipe::InitBufPool(T& bufPool, uint32_t len, U& shareBuf)
{
    static_assert((T::isTbufPool && U::isTbufPool),
        "TPipe::InitBufPool(T& bufPool, uint32_t len, U& shareBuf) only supports T and U as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    ASCENDC_ASSERT((pool == GetPhyType(U::poolPos)),
        { KERNEL_LOG(KERNEL_ERROR, "Hardware type of input bufPool should be the same as shareBuf"); });
    bufPool.g_tBufPoolImpl.startAddr_ = shareBuf.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = shareBuf.g_tBufPoolImpl.maxLen_;
    ASCENDC_ASSERT((len <= shareBuf.g_tBufPoolImpl.maxLen_), {
        KERNEL_LOG(KERNEL_ERROR, "Length of input bufPool should be no longer than length of shareBuf, which is %u",
            shareBuf.g_tBufPoolImpl.maxLen_);
    });
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto pos = T::poolPos;
    auto absAddr = GetBaseAddr(static_cast<int8_t>(pos));
    AscendCTBufPoolInit(static_cast<uint8_t>(pos),
        reinterpret_cast<uint64_t>(bufPool.g_tBufPoolImpl.startAddr_ + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    return true;
}
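/*
 * Illustrative pool usage (placeholder names; template arguments assume the
 * default TBufPool configuration): the shareBuf overload lets two pools alias
 * one physical range, so mutually exclusive stages can reuse the same memory.
 *
 *     TPipe pipe;
 *     TBufPool<TPosition::VECCALC> poolA;
 *     TBufPool<TPosition::VECCALC> poolB;
 *     pipe.InitBufPool(poolA, 8192);        // carve 8 KB out of the pipe
 *     pipe.InitBufPool(poolB, 8192, poolA); // poolB reuses poolA's range
 */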
template <HardEvent evt>
__aicore__ inline TEventID TPipe::AllocEventID()
{
    ASCENDC_ASSERT((evt < HardEvent::MAX),
        { KERNEL_LOG(KERNEL_ERROR, "illegal event %d", static_cast<int32_t>(evt)); });
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    auto lastId = sff0(ptr->eventOccupy);
    ASCENDC_ASSERT((lastId < QUE_MAX_EVENT && lastId >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %ld, max buffer number in same queue position is %d", lastId,
            QUE_MAX_EVENT);
    });
    ptr->eventOccupy = sbitset1(ptr->eventOccupy, lastId);
    return lastId;
}

template <HardEvent evt>
__aicore__ inline void TPipe::ReleaseEventID(TEventID id)
{
    ASCENDC_ASSERT((id >= 0 && id < QUE_MAX_EVENT), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %d, which should be larger than 0, and smaller than %d",
            static_cast<int32_t>(id), QUE_MAX_EVENT);
    });
    ASCENDC_ASSERT((evt != HardEvent::MAX), { KERNEL_LOG(KERNEL_ERROR, "evt cannot be HardEvent::MAX"); });
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    ptr->eventOccupy = sbitset0(ptr->eventOccupy, id);
    return;
}

__aicore__ inline TEventID TPipe::FetchEventID(HardEvent evt)
{
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    auto lastId = sff0(ptr->eventOccupy);
    ASCENDC_ASSERT((lastId < QUE_MAX_EVENT && lastId >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %ld, max buffer number in same queue position is %d", lastId,
            QUE_MAX_EVENT);
    });
    return lastId;
}

template <HardEvent evt>
__aicore__ inline TEventID TPipe::FetchEventID()
{
    auto ptr = this->g_tpipeImpl.eventPool_ + EventToIndex(evt);
    auto lastId = sff0(ptr->eventOccupy);
    ASCENDC_ASSERT((lastId < QUE_MAX_EVENT && lastId >= 0), {
        KERNEL_LOG(KERNEL_ERROR, "current id is %ld, max buffer number in same queue position is %d", lastId,
            QUE_MAX_EVENT);
    });
    return lastId;
}
template <TPosition pos>
[[deprecated("NOTICE: GetAbsAddr has been deprecated and will be removed in the next version. "
             "Please do not use it!")]]
__aicore__ inline TBuffAddr TPipe::GetAbsAddr(int32_t offset, int32_t len) const
{
    TBuffAddr addr;
    addr.logicPos = static_cast<int8_t>(pos);
    addr.bufferHandle = nullptr;
    addr.bufferAddr = offset;
    addr.dataLen = len;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    constexpr auto pool = GetPhyType(pos);
    ASCENDC_ASSERT((pool != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos can not be Hardware::GM"); });
    ASCENDC_ASSERT(((offset + len) <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "offset is %d, len is %d, which exceeds limit %d", offset, len,
            bufferInitLen.at(pool));
    });
    auto absAddr = this->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(pool)].absAddr;
    addr.absAddr = absAddr + addr.bufferAddr;
#endif
    return addr;
}

template <TPosition pos, typename T>
[[deprecated("NOTICE: GetAbsAddr has been deprecated and will be removed in the next version. "
             "Please do not use it!")]]
__aicore__ inline __sync_noalias__ LocalTensor<T> TPipe::GetAbsAddr(int32_t offset, int32_t size) const
{
    TBuffAddr addr = GetAbsAddr<pos>(offset, static_cast<int32_t>(size * sizeof(T)));
    LocalTensor<T> tensor;
    tensor.SetAddr(addr);
    return tensor;
}

__aicore__ inline void TPipe::InitShareBufStart(uint32_t mode, uint32_t* shareLens, uint32_t lens,
    uint8_t subBlockIdx)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((lens == static_cast<uint32_t>(TShareBuf::ShareHard::MAX)), {
        KERNEL_LOG(KERNEL_ERROR, "lens is %d, which should be %d", lens,
            static_cast<int32_t>(TShareBuf::ShareHard::MAX));
    });
#else
    (void)(lens);
#endif
    ASCENDC_ASSERT((subBlockIdx == 0 || subBlockIdx == 1),
        { KERNEL_LOG(KERNEL_ERROR, "subBlockIdx is %d, which should only be 0/1", subBlockIdx); });
    AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L1), Hardware::L1, subBlockIdx);
    AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L0C), Hardware::L0C, subBlockIdx);
#if __CCE_AICORE__ < 220
    AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::UB), Hardware::UB, subBlockIdx);
#endif
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0A)].maxAddr = 0;
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0B)].maxAddr = 0;
    // v100 shouldn't use the bias table
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::BIAS)].maxAddr = 0;
    return;
}

__aicore__ inline void TPipe::InitShareBufEnd()
{
    // debug methods need to be added.
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr =
        g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L1)];
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0C)].maxAddr =
        g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L0C)];
#if __CCE_AICORE__ < 220
    this->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::UB)].maxAddr =
        g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::UB)];
#endif
    return;
}

__aicore__ inline void InitShareBufStart(TPipe* tpipe, uint32_t mode, uint32_t* shareLens, uint32_t lens,
    uint8_t subBlockIdx)
{
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    ASCENDC_ASSERT((lens == static_cast<uint32_t>(TShareBuf::ShareHard::MAX)), {
        KERNEL_LOG(KERNEL_ERROR, "lens is %d, which should be %d", lens,
            static_cast<int32_t>(TShareBuf::ShareHard::MAX));
    });
#else
    (void)(lens);
#endif
    ASCENDC_ASSERT((subBlockIdx == 0 || subBlockIdx == 1),
        { KERNEL_LOG(KERNEL_ERROR, "subBlockIdx is %d, which should only be 0/1", subBlockIdx); });
    tpipe->AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L1), Hardware::L1,
        subBlockIdx);
    tpipe->AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::L0C), Hardware::L0C,
        subBlockIdx);
#if __CCE_AICORE__ < 220
    tpipe->AuxShareBufStart(mode, shareLens, static_cast<uint8_t>(TShareBuf::ShareHard::UB), Hardware::UB,
        subBlockIdx);
#endif
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0A)].maxAddr = 0;
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0B)].maxAddr = 0;
    // v100 shouldn't use the bias table
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::BIAS)].maxAddr = 0;
    return;
}

__aicore__ inline void InitShareBufEnd(TPipe* tpipe)
{
    // debug methods need to be added.
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L1)].maxAddr =
        tpipe->g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L1)];
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::L0C)].maxAddr =
        tpipe->g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::L0C)];
#if __CCE_AICORE__ < 220
    tpipe->g_tpipeImpl.bufPool_[static_cast<int32_t>(Hardware::UB)].maxAddr =
        tpipe->g_tpipeImpl.shareBufPool_.maxAddr[static_cast<int32_t>(TShareBuf::ShareHard::UB)];
#endif
    return;
}
template <typename T>
__aicore__ inline void TPipe::InitSpmBuffer(const GlobalTensor<T>& workspace, const int32_t bufferSize)
{
    g_tpipeImpl.spmInfo_.spmBuffSize = bufferSize;
    g_tpipeImpl.spmInfo_.spmAddr = reinterpret_cast<uint64_t>(workspace.GetPhyAddr());
    g_tpipeImpl.spmInfo_.spmBufType = static_cast<uint8_t>(Hardware::GM);
}

__aicore__ inline void TPipe::InitSpmBuffer(const int32_t bufferSize)
{
#if __CCE_AICORE__ >= 220
    (void)(bufferSize);
    ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "only supports platforms ascend910 and ascend310p"); });
#else
    g_tpipeImpl.spmInfo_.spmBuffSize = bufferSize;
    TQueBind<TPosition::GM, TPosition::A1, 1> inQueue;
    constexpr auto pool = GetPhyType(QuePosition::A1);
    g_tpipeImpl.spmInfo_.spmAddr = g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr;
#ifdef __CCE_KT_TEST__
    auto absAddr = GetBaseAddr(static_cast<int8_t>(TPosition::A1));
    g_tpipeImpl.spmInfo_.spmAddr = g_tpipeImpl.spmInfo_.spmAddr + reinterpret_cast<uint64_t>(absAddr);
#endif
    InitBuffer(inQueue, 1, bufferSize);
    g_tpipeImpl.spmInfo_.spmBufType = static_cast<uint8_t>(Hardware::L1);
#endif
}
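/*
 * Illustrative spill-buffer usage (placeholder names): the Write/ReadSpmBuffer
 * implementations below spill a UB tile to the buffer prepared by
 * InitSpmBuffer (GM, or L1 on pre-220 chips) and insert all of the MTE/V
 * handshakes themselves, so the caller only provides tensors and offsets.
 *
 *     pipe.InitSpmBuffer(workspaceGm, SPM_SIZE);  // spill target in GM
 *     pipe.WriteSpmBuffer(tileUb, copyParams, 0); // UB -> spill storage
 *     ...                                         // reuse tileUb meanwhile
 *     pipe.ReadSpmBuffer(tileUb, copyParams, 0);  // spill storage -> UB
 */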
template <typename T>
__aicore__ inline void TPipe::WriteSpmBuffer(const LocalTensor<T>& writeLocal, const DataCopyParams& copyParams,
    int32_t writeOffset)
{
    /*
     * before the write, the local may come from MTE2/V, so we need MTE3 to wait for V/MTE2
     * after the write, the local may be used to compute or copy out, so V/MTE2 must wait for MTE3
     */
    event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
    SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        DataCopyUB2GMImpl(reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), copyParams);
        event_t eventIDMTE3ToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
        SetFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
        WaitFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((writeOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "writeOffset is %d, which must be 32B aligned", writeOffset); });
        DataCopyUB2L1Impl(reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), copyParams);
        event_t eventIDMTE3ToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE1));
        SetFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
        WaitFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
    }
    event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
    SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
    WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
}

template <typename T>
__aicore__ inline void TPipe::ReadSpmBuffer(const LocalTensor<T>& readLocal, const DataCopyParams& copyParams,
    int32_t readOffset)
{
    /*
     * before the read, the local may still be computed on, so MTE must wait for V
     * after the read, the local may be used to compute or copy out, so V/MTE3 must wait for the read
     */
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        event_t eventIDVToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2));
        event_t eventIDMTE2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
        SetFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        WaitFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        DataCopyGM2UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, copyParams);
        SetFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
        WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((readOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "readOffset is %d, which must be 32B aligned", readOffset); });
        event_t eventIDVToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE1));
        event_t eventIDMTE1ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_V));
        event_t eventIDMTE1ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE3));
        SetFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        WaitFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        DataCopyL12UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, copyParams);
        SetFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        WaitFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        SetFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
        WaitFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
    }
}

template <typename T>
__aicore__ inline void TPipe::WriteSpmBuffer(const LocalTensor<T>& writeLocal, const int32_t writeSize,
    int32_t writeOffset)
{
    /*
     * before the write, the local may come from MTE2/V, so we need MTE3 to wait for V/MTE2
     * after the write, the local may be used to compute or copy out, so V/MTE2 must wait for MTE3
     */
    int computeSize = writeSize != 0 ? writeSize : GetShapeSize(writeLocal.GetShapeInfo());
    struct DataCopyParams repeatParams;
    repeatParams.blockLen = computeSize / AscendCUtils::GetC0Count(sizeof(T));
    event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
    event_t eventIDMTE3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
    SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
    SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        DataCopyUB2GMImpl(reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), repeatParams);
        event_t eventIDMTE3ToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
        SetFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
        WaitFlag<HardEvent::MTE3_MTE2>(eventIDMTE3ToMTE2);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((writeOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "writeOffset is %d, which must be 32B aligned", writeOffset); });
        ASCENDC_ASSERT((writeSize % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "writeSize is %d, which must be 32B aligned", writeSize); });
        DataCopyUB2L1Impl(reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + writeOffset,
            reinterpret_cast<__ubuf__ T*>(writeLocal.GetPhyAddr()), repeatParams);
        event_t eventIDMTE3ToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE1));
        SetFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
        WaitFlag<HardEvent::MTE3_MTE1>(eventIDMTE3ToMTE1);
    }
    SetFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
    WaitFlag<HardEvent::MTE3_V>(eventIDMTE3ToV);
}

template <typename T>
__aicore__ inline void TPipe::ReadSpmBuffer(const LocalTensor<T>& readLocal, const int32_t readSize,
    int32_t readOffset)
{
    /*
     * before the read, the local may still be computed on, so MTE must wait for V
     * after the read, the local may be used to compute or copy out, so V/MTE3 must wait for the read
     */
    int computeSize = readSize != 0 ? readSize : GetShapeSize(readLocal.GetShapeInfo());
    struct DataCopyParams repeatParams;
    repeatParams.blockLen = computeSize / AscendCUtils::GetC0Count(sizeof(T));
    if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::GM)) {
        event_t eventIDVToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2));
        event_t eventIDMTE2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
        SetFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        WaitFlag<HardEvent::V_MTE2>(eventIDVToMTE2);
        DataCopyGM2UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__gm__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, repeatParams);
        SetFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
        SetFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
        WaitFlag<HardEvent::MTE2_MTE3>(eventIDMTE2ToMTE3);
    } else if (g_tpipeImpl.spmInfo_.spmBufType == static_cast<uint8_t>(Hardware::L1)) {
        ASCENDC_ASSERT((readOffset % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "readOffset is %d, which must be 32B aligned", readOffset); });
        ASCENDC_ASSERT((readSize % ONE_BLK_SIZE == 0),
            { KERNEL_LOG(KERNEL_ERROR, "readSize is %d, which must be 32B aligned", readSize); });
        event_t eventIDVToMTE1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE1));
        event_t eventIDMTE1ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_V));
        event_t eventIDMTE1ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE3));
        SetFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        WaitFlag<HardEvent::V_MTE1>(eventIDVToMTE1);
        DataCopyL12UBImpl(reinterpret_cast<__ubuf__ T*>(readLocal.GetPhyAddr()),
            reinterpret_cast<__cbuf__ T*>(g_tpipeImpl.spmInfo_.spmAddr) + readOffset, repeatParams);
        SetFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        WaitFlag<HardEvent::MTE1_V>(eventIDMTE1ToV);
        SetFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
        WaitFlag<HardEvent::MTE1_MTE3>(eventIDMTE1ToMTE3);
    }
}

template <TPosition pos>
__aicore__ inline uint64_t TPipe::GetQueueEndAddress()
{
    Hardware hardType = GetPhyType(pos);
    ASCENDC_ASSERT((hardType == Hardware::UB), { KERNEL_LOG(KERNEL_ERROR, "hardType should be UB"); });
    return this->g_tpipeImpl.bufPool_[static_cast<int32_t>(hardType)].maxAddr;
}

__aicore__ inline void TPipe::Destroy()
{
    g_tpipeImpl.isDestroy = true;
    auto ptr = this->g_tpipeImpl.buf_;
    for (uint8_t i = 0; i < this->g_tpipeImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    // for matmul macro, release M_MTE1 event ids 0 and 1.
#if __CCE_AICORE__ == 220
    if ASCEND_IS_AIC {
        WaitFlag<HardEvent::M_MTE1>(0);
        ReleaseEventID<HardEvent::M_MTE1>(0);
        WaitFlag<HardEvent::M_MTE1>(1);
        ReleaseEventID<HardEvent::M_MTE1>(1);
        // For Bias
        WaitFlag<HardEvent::M_MTE1>(2);
        ReleaseEventID<HardEvent::M_MTE1>(2);
    }
#elif __CCE_AICORE__ == 300
    WaitFlag<HardEvent::M_MTE1>(0);
    ReleaseEventID<HardEvent::M_MTE1>(0);
    WaitFlag<HardEvent::M_MTE1>(1);
    ReleaseEventID<HardEvent::M_MTE1>(1);
    WaitFlag<HardEvent::M_MTE1>(2);
    ReleaseEventID<HardEvent::M_MTE1>(2);
#endif
    pipe_barrier(PIPE_ALL);
#if __CCE_AICORE__ == 200
    dcci((__gm__ int64_t*)0, cache_line_t::ENTIRE_DATA_CACHE);
#endif
}

__aicore__ inline void TPipe::Reset()
{
    auto ptr = this->g_tpipeImpl.buf_;
    for (uint8_t i = 0; i < this->g_tpipeImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    InitSocState();
    ResetPool();
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    for (int32_t i = 0; i < static_cast<int32_t>(Hardware::MAX); i++) {
        SetBufferCtx((Hardware)i, &g_tpipeImpl.bufPoolBaseAddr_[i]);
    }
#endif
}

#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
template <typename T>
[[deprecated("NOTICE: GetAbsAddr has been deprecated and will be removed in the next version. "
             "Please do not use it!")]]
inline uint64_t TPipe::GetAbsAddr(const LocalTensor<T>& tensor)
{
    // Translates the CPU address to the actual physical address.
    // Currently, only L1 or UB address translation is supported.
    int8_t logicPos = tensor.GetPosition();
    auto positionHardMap = ConstDefiner::Instance().positionHardMap;
    ASCENDC_ASSERT((positionHardMap.find((TPosition)logicPos) != positionHardMap.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal logicPos %d ", static_cast<int32_t>(logicPos)); });
    Hardware hardType = positionHardMap.at((TPosition)logicPos);
    ASCENDC_ASSERT(((hardType == Hardware::UB) || (hardType == Hardware::L1)),
        { KERNEL_LOG(KERNEL_ERROR, "illegal hardType %d ", static_cast<int32_t>(hardType)); });
    uint8_t* phyAddr = reinterpret_cast<uint8_t*>(tensor.GetPhyAddr());
    uint8_t* baseAddr =
        static_cast<uint8_t*>(g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(hardType)].absAddr);
    ASCENDC_ASSERT((phyAddr >= baseAddr), {
        KERNEL_LOG(KERNEL_ERROR, "phyAddr is %p, baseAddr is %p, phyAddr should not be smaller than baseAddr",
            phyAddr, baseAddr);
    });
    uint64_t delta = phyAddr - baseAddr;
    if (hardType == Hardware::UB) {
        ASCENDC_ASSERT((delta < TMP_UB_OFFSET),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds ub limit %lu ", delta, TMP_UB_OFFSET); });
    } else {
        ASCENDC_ASSERT((delta < TOTAL_L1_SIZE),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds l1 limit %lu", delta, TOTAL_L1_SIZE); });
    }
    return delta;
}

template <typename T>
inline uint64_t GetAbsAddr(TPipe* tpipe, const LocalTensor<T>& tensor)
{
    // Translates the CPU address to the actual physical address.
    // Currently, only L1 or UB address translation is supported.
    int8_t logicPos = tensor.GetPosition();
    auto positionHardMap = ConstDefiner::Instance().positionHardMap;
    ASCENDC_ASSERT((positionHardMap.find((TPosition)logicPos) != positionHardMap.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal logicPos %d ", static_cast<int32_t>(logicPos)); });
    Hardware hardType = positionHardMap.at((TPosition)logicPos);
    ASCENDC_ASSERT(((hardType == Hardware::UB) || (hardType == Hardware::L1)),
        { KERNEL_LOG(KERNEL_ERROR, "illegal hardType %d ", static_cast<int32_t>(hardType)); });
    uint8_t* phyAddr = reinterpret_cast<uint8_t*>(tensor.GetPhyAddr());
    uint8_t* baseAddr =
        static_cast<uint8_t*>(tpipe->g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(hardType)].absAddr);
    ASCENDC_ASSERT((phyAddr >= baseAddr), {
        KERNEL_LOG(KERNEL_ERROR, "phyAddr is %p, baseAddr is %p, phyAddr should not be smaller than baseAddr",
            phyAddr, baseAddr);
    });
    uint64_t delta = phyAddr - baseAddr;
    if (hardType == Hardware::UB) {
        ASCENDC_ASSERT((delta < TMP_UB_OFFSET),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds ub limit %lu ", delta, TMP_UB_OFFSET); });
    } else {
        ASCENDC_ASSERT((delta < TOTAL_L1_SIZE),
            { KERNEL_LOG(KERNEL_ERROR, "addr %lu exceeds l1 limit %lu", delta, TOTAL_L1_SIZE); });
    }
    return delta;
}

inline uint8_t* TPipe::GetBaseAddr(int8_t logicPos)
{
    auto positionHardMap = ConstDefiner::Instance().positionHardMap;
    ASCENDC_ASSERT((positionHardMap.find((TPosition)logicPos) != positionHardMap.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal logicPos %d ", int32_t(logicPos)); });
    Hardware hardType = positionHardMap.at((TPosition)logicPos);
    ASCENDC_ASSERT((hardType != Hardware::GM),
        { KERNEL_LOG(KERNEL_ERROR, "hardware position can not be gm"); });
    uint8_t* baseAddr =
        static_cast<uint8_t*>(g_tpipeImpl.bufPoolBaseAddr_[static_cast<int32_t>(hardType)].absAddr);
    return baseAddr;
}
void inline TPipe::SetBufferCtx(Hardware hard, struct BufPoolExtra* bufPool)
{
    ASCENDC_ASSERT((hard != Hardware::MAX),
        { KERNEL_LOG(KERNEL_ERROR, "hard type can not be Hardware::MAX"); });
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((bufferInitLen.find(hard) != bufferInitLen.end()),
        { KERNEL_LOG(KERNEL_ERROR, "illegal hard type %d", static_cast<int32_t>(hard)); });
    uint8_t* ptr;
    if (hard == Hardware::GM) {
        ptr = ConstDefiner::Instance().cpuGM;
    } else {
        ptr = ConstDefiner::Instance().hardwareCpuBufferMap.at(hard);
    }
    {
        // init memory with random values
        std::default_random_engine e;
        int32_t* p = reinterpret_cast<int32_t*>(ptr);
        for (uint64_t i = 0; i < bufferInitLen.at(hard) / sizeof(int32_t); i++) {
            p[i] = e();
        }
    }
    bufPool->phySpace = bufferInitLen.at(hard);
    bufPool->absAddr = ptr;
    return;
}
#endif

__aicore__ inline void TPipe::AuxShareBufStart(uint32_t mode, uint32_t* shareLens, uint8_t pos, Hardware hard,
    uint8_t subBlockIdx)
{
    uint8_t hardU8 = static_cast<uint8_t>(hard);
    if (unlikely(g_tpipeImpl.shareBufPool_.start[pos] == -1)) {
        // The address has not been initialized. Record the maximum allocated address.
        g_tpipeImpl.shareBufPool_.start[pos] = this->g_tpipeImpl.bufPool_[hardU8].maxAddr;
        g_tpipeImpl.shareBufPool_.maxAddr[pos] = g_tpipeImpl.shareBufPool_.start[pos] + shareLens[pos];
        DEBUG_CODE(g_tpipeImpl.shareBufPool_.length[pos] = shareLens[pos]);
    } else {
        DEBUG_CODE(g_tpipeImpl.shareBufPool_.length[pos] =
            g_tpipeImpl.shareBufPool_.length[pos] > shareLens[pos] ? g_tpipeImpl.shareBufPool_.length[pos]
                                                                   : shareLens[pos]);
        // Record the maximum allocated address.
        g_tpipeImpl.shareBufPool_.maxAddr[pos] = this->g_tpipeImpl.bufPool_[hardU8].maxAddr;
        g_tpipeImpl.bufPool_[hardU8].maxAddr = g_tpipeImpl.shareBufPool_.start[pos]; // Reset resource start position.
    }
    if (mode == 1 && subBlockIdx == 1) {
        this->g_tpipeImpl.bufPool_[hardU8].maxAddr += shareLens[pos] / HALF_FACTOR; // Reset resource start position.
    }
    ASCENDC_ASSERT((g_tpipeImpl.shareBufPool_.length[pos] >= shareLens[pos]), {
        KERNEL_LOG(KERNEL_ERROR, "share buf len is %d, which exceeds recorded limit %d", shareLens[pos],
            g_tpipeImpl.shareBufPool_.length[pos]);
    });
}

__aicore__ inline void TPipe::InitSocState() const
{
    set_atomic_none();
#if __CCE_AICORE__ == 220
    if ASCEND_IS_AIC {
        set_mask_norm();
        set_l1_3d_size(static_cast<uint64_t>(0));
        set_padding(static_cast<uint64_t>(0));
    } else {
        set_vector_mask(static_cast<uint64_t>(-1), static_cast<uint64_t>(-1));
        set_mask_norm();
    }
#elif __CCE_AICORE__ == 300
    set_padding(static_cast<uint64_t>(0));
#endif
}

__aicore__ inline void TPipe::ResetPool()
{
    g_tpipeImpl.tscmBufferPtr_ = TOTAL_L1_SIZE;
    g_tpipeImpl.curBufSize_ = 0;
    auto buf = g_tpipeImpl.bufPool_;
    for (int32_t i = 0; i < static_cast<int32_t>(Hardware::MAX); i++, buf++) {
        buf->maxAddr = 0;
    }
    auto evt = g_tpipeImpl.eventPool_;
    for (int32_t i = 0; i < EVENT_NUM; i++, evt++) {
        evt->eventOccupy = 0;
    }
    g_tpipeImpl.shareBufPool_.start[static_cast<int32_t>(TShareBuf::ShareHard::L1)] = -1;
    g_tpipeImpl.shareBufPool_.start[static_cast<int32_t>(TShareBuf::ShareHard::UB)] = -1;
    g_tpipeImpl.shareBufPool_.start[static_cast<int32_t>(TShareBuf::ShareHard::L0C)] = -1;
}
template <class T>
__aicore__ inline bool TPipe::TscmInitBuffer(T& que, uint8_t num, uint32_t len)
{
    ASCENDC_ASSERT(((num * len) < TOTAL_L1_SIZE), {
        KERNEL_LOG(KERNEL_ERROR, "tscm buffer length is %u bytes, which is larger than total l1 size %u bytes",
            len * num, TOTAL_L1_SIZE);
    });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    que.value = num;
    que.bufStart = this->g_tpipeImpl.buf_ + this->g_tpipeImpl.curBufSize_;
    DEBUG_CODE(que.bufLen = num * len);
    // Assign l1
    constexpr Hardware pool = Hardware::L1;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "buffer length %d is too large, the limit is %d", num * len,
            bufferInitLen.at(pool));
    });
#endif
    uint32_t curPoolAddr;
    if constexpr (T::scmBlockGroup) {
        curPoolAddr = g_tpipeImpl.tscmBufferPtr_ - num * len;
        g_tpipeImpl.tscmBufferPtr_ -= num * len;
    } else {
        curPoolAddr = g_tpipeImpl.tscmBufferPtr_ - (GetTaskRationImpl() - GetSubBlockIdxImpl()) * len * num;
        g_tpipeImpl.tscmBufferPtr_ -= GetTaskRationImpl() * num * len;
    }
    auto ptr = que.bufStart;
    for (int32_t i = 0; i < num; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->freeBufEvt = T::freeBufEvt;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT(
        (this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr <= this->g_tpipeImpl.tscmBufferPtr_), {
            KERNEL_LOG(KERNEL_ERROR, "tscm addr %d overlapped with maxAddr %d", this->g_tpipeImpl.tscmBufferPtr_,
                this->g_tpipeImpl.bufPool_[static_cast<int32_t>(pool)].maxAddr);
        });
    this->g_tpipeImpl.curBufSize_ += num;
    ASCENDC_ASSERT((this->g_tpipeImpl.curBufSize_ <= QBUF_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "max buffer num is %d, current buf size %d exceeds this limit", QBUF_MAX_LEN,
            this->g_tpipeImpl.curBufSize_);
    });
    return true;
}

// begin impl of tBufPool
template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline TBufPool<pos, bufIDSize>::TBufPool()
{
    Init();
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline TBufPool<pos, bufIDSize>::~TBufPool()
{
    auto ptr = this->g_tBufPoolImpl.buf_;
    for (uint8_t i = 0; i < this->g_tBufPoolImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    ResetPool();
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline void TBufPool<pos, bufIDSize>::ResetPool()
{
    g_tBufPoolImpl.curBufSize_ = 0;
    g_tBufPoolImpl.startAddr_ = 0;
    g_tBufPoolImpl.maxAddr_ = 0;
    g_tBufPoolImpl.maxLen_ = 0;
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline void TBufPool<pos, bufIDSize>::Init()
{
    constexpr auto pool = GetPhyType(pos);
    static_assert((pool == Hardware::L1 || pool == Hardware::UB),
        "TBufPool position should be one of A1/B1/C1/VECIN/VECOUT/VECCALC");
    ResetPool();
    g_tBufPoolImpl.isReset_ = true;
}

template <TPosition pos, uint32_t bufIDSize>
__aicore__ inline void TBufPool<pos, bufIDSize>::Reset()
{
    auto ptr = this->g_tBufPoolImpl.buf_;
    for (uint8_t i = 0; i < this->g_tBufPoolImpl.curBufSize_; i++, ptr++) {
        if (ptr->freeBufEvtID != INVALID_TEVENTID) {
            WaitFlagImpl(ptr->freeBufEvt, ptr->freeBufEvtID);
            ptr->freeBufEvtID = INVALID_TEVENTID;
        }
    }
    ResetPool();
    g_tBufPoolImpl.isReset_ = true;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    AscendCUpdateTbufPoolStatus(reinterpret_cast<uint64_t>(&g_tBufPoolImpl), g_tBufPoolImpl.isReset_);
#endif
}
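/*
 * Illustrative nested-pool usage (placeholder names): queues carved from a
 * TBufPool by the InitBuffer overloads below behave like TPipe-initialized
 * queues, and Reset() drains pending free-buffer events before returning the
 * range for reuse by a later stage.
 *
 *     TBufPool<TPosition::VECCALC> pool;
 *     pipe.InitBufPool(pool, 16384);
 *     TQue<TPosition::VECIN, 2> que;
 *     pool.InitBuffer(que, 2, 4096); // 2 x 4 KB slots from the pool
 *     ...                            // stage 1 runs
 *     pool.Reset();                  // release the range for stage 2
 */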
template <TPosition pos, uint32_t bufIDSize>
template <class T>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBuffer(T& que, uint8_t num, uint32_t len)
{
    static_assert((T::isTQue),
        "TBufPool::InitBuffer(T& que, uint8_t num, uint32_t len) does not support T as TBuf");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    que.value = num;
    que.bufStart = this->g_tBufPoolImpl.buf_ + this->g_tBufPoolImpl.curBufSize_;
    DEBUG_CODE(que.bufLen = num * len);
    ASCENDC_ASSERT(
        (this->g_tBufPoolImpl.maxAddr_ + num * len <=
            this->g_tBufPoolImpl.startAddr_ + this->g_tBufPoolImpl.maxLen_), {
            KERNEL_LOG(KERNEL_ERROR, "Buffer init length exceeds the limit of the BufPool. Max length of the "
                "BufPool is %u", this->g_tBufPoolImpl.maxLen_);
        });
    auto curPoolAddr = this->g_tBufPoolImpl.maxAddr_;
    auto ptr = que.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    Hardware pool = GetBufferPos(T::srcPosition, T::dstPosition);
    ASCENDC_ASSERT((pool == GetPhyType(pos)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos should be the same as the pos of the TBufPool"); });
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", num * len, bufferInitLen.at(pool));
    });
    auto bufPos = GetPosition(T::srcPosition, T::dstPosition);
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCBufInit(static_cast<uint8_t>(bufPos), 0, num, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
    que.SetTBufPoolHandle(reinterpret_cast<uint64_t>(&g_tBufPoolImpl));
    ASCENDC_ASSERT((curPoolAddr + num * len <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
#endif
    for (int32_t i = 0; i < num; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->freeBufEvt = T::freeBufEvt;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    this->g_tBufPoolImpl.maxAddr_ = curPoolAddr;
    this->g_tBufPoolImpl.curBufSize_ += num;
    ASCENDC_ASSERT((this->g_tBufPoolImpl.curBufSize_ <= QBUFPOOL_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, limit is %d", this->g_tBufPoolImpl.curBufSize_,
            QBUFPOOL_MAX_LEN);
    });
    return true;
}
template <TPosition pos, uint32_t bufIDSize>
template <TPosition bufPos>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBuffer(TBuf<bufPos>& buf, uint32_t len)
{
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr int32_t bufHandleSize = 1;
    buf.bufStart = this->g_tBufPoolImpl.buf_ + this->g_tBufPoolImpl.curBufSize_;
    buf.bufLen = len;
    buf.offset = 0;
    ASCENDC_ASSERT(
        (this->g_tBufPoolImpl.maxAddr_ + len <= this->g_tBufPoolImpl.startAddr_ + this->g_tBufPoolImpl.maxLen_), {
            KERNEL_LOG(KERNEL_ERROR, "Buffer init length exceeds the limit of the BufPool. Max length of the "
                "BufPool is %u", this->g_tBufPoolImpl.maxLen_);
        });
    constexpr auto pool = GetPhyType(bufPos);
    ASCENDC_ASSERT((GetPhyType(bufPos) == GetPhyType(pos)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer pos should be the same as the pos of the TBufPool"); });
    auto curPoolAddr = this->g_tBufPoolImpl.maxAddr_;
    auto ptr = buf.bufStart;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "len is %u, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCBufInit(static_cast<uint8_t>(bufPos), 1, 1, reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len);
    buf.SetTBufPoolHandle(reinterpret_cast<uint64_t>(&g_tBufPoolImpl));
#endif
    for (uint8_t i = 0; i < bufHandleSize; i++, ptr++) {
        ptr->state = TBufState::FREE;
        ptr->enQueEvtID = INVALID_TEVENTID;
        ptr->freeBufEvtID = INVALID_TEVENTID;
        ptr->address = curPoolAddr;
        ptr->dataLen = len;
        ptr->usertag = -1;
        curPoolAddr += len;
    }
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, which exceeds limit %d", curPoolAddr,
            bufferInitLen.at(pool));
    });
    this->g_tBufPoolImpl.maxAddr_ = curPoolAddr;
    this->g_tBufPoolImpl.curBufSize_ += bufHandleSize;
    ASCENDC_ASSERT((this->g_tBufPoolImpl.curBufSize_ <= QBUFPOOL_MAX_LEN), {
        KERNEL_LOG(KERNEL_ERROR, "current total buffer num is %d, which exceeds limit %d",
            this->g_tBufPoolImpl.curBufSize_, QBUFPOOL_MAX_LEN);
    });
    return true;
}
template <TPosition pos, uint32_t bufIDSize>
template <class T>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBufPool(T& bufPool, uint32_t len)
{
    static_assert((T::isTbufPool),
        "TBufPool::InitBufPool(T& bufPool, uint32_t len) only supports T as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    bufPool.g_tBufPoolImpl.startAddr_ = this->g_tBufPoolImpl.maxAddr_;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = len;
    ASCENDC_ASSERT(
        (this->g_tBufPoolImpl.maxAddr_ + len <= this->g_tBufPoolImpl.startAddr_ + this->g_tBufPoolImpl.maxLen_), {
            KERNEL_LOG(KERNEL_ERROR, "Buffer init length exceeds the limit of the BufPool. Max length of the "
                "BufPool is %u", this->g_tBufPoolImpl.maxLen_);
        });
    auto curPoolAddr = this->g_tBufPoolImpl.maxAddr_;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto bufPos = T::poolPos;
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCTBufPoolInit(static_cast<uint8_t>(bufPos), reinterpret_cast<uint64_t>(curPoolAddr + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
    AscendCRecordPoolHierarchy(reinterpret_cast<uint64_t>(&this->g_tBufPoolImpl),
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    curPoolAddr += len;
    ASCENDC_ASSERT((curPoolAddr <= bufferInitLen.at(pool)), {
        KERNEL_LOG(KERNEL_ERROR, "curPoolAddr is %d, limit is %d", curPoolAddr, bufferInitLen.at(pool));
    });
    this->g_tBufPoolImpl.maxAddr_ = curPoolAddr;
    return true;
}

template <TPosition pos, uint32_t bufIDSize>
template <class T, class U>
__aicore__ inline bool TBufPool<pos, bufIDSize>::InitBufPool(T& bufPool, uint32_t len, U& shareBuf)
{
    static_assert((T::isTbufPool && U::isTbufPool),
        "TBufPool::InitBufPool(T& bufPool, uint32_t len, U& shareBuf) only supports T and U as TBufPool");
    ASCENDC_ASSERT((len > 0),
        { KERNEL_LOG(KERNEL_ERROR, "buffer length is %u, which should be larger than 0", len); });
    len = (len + ONE_BLK_SIZE - MIN_BLOCK_LEN) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    constexpr auto pool = GetPhyType(T::poolPos);
    constexpr auto sharedPool = GetPhyType(U::poolPos);
    ASCENDC_ASSERT((pool == sharedPool),
        { KERNEL_LOG(KERNEL_ERROR, "Position of input bufPool should be the same as position of shareBuf"); });
    bufPool.g_tBufPoolImpl.startAddr_ = shareBuf.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxAddr_ = bufPool.g_tBufPoolImpl.startAddr_;
    bufPool.g_tBufPoolImpl.maxLen_ = shareBuf.g_tBufPoolImpl.maxLen_;
    ASCENDC_ASSERT((len <= shareBuf.g_tBufPoolImpl.maxLen_), {
        KERNEL_LOG(KERNEL_ERROR, "Length of input bufPool should be no longer than length of shareBuf, which is %u",
            shareBuf.g_tBufPoolImpl.maxLen_);
    });
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
    auto bufferInitLen = ConstDefiner::Instance().bufferInitLen;
    ASCENDC_ASSERT((len <= bufferInitLen.at(pool)),
        { KERNEL_LOG(KERNEL_ERROR, "buffer size is %d, which exceeds limit %d", len, bufferInitLen.at(pool)); });
    auto bufPos = T::poolPos;
    auto absAddr = GetTPipePtr()->GetBaseAddr(static_cast<int8_t>(bufPos));
    AscendCTBufPoolInit(static_cast<uint8_t>(bufPos),
        reinterpret_cast<uint64_t>(bufPool.g_tBufPoolImpl.startAddr_ + absAddr), len,
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
    AscendCRecordPoolHierarchy(reinterpret_cast<uint64_t>(&this->g_tBufPoolImpl),
        reinterpret_cast<uint64_t>(&bufPool.g_tBufPoolImpl));
#endif
    return true;
}
} // namespace AscendC
#endif // ASCENDC_MODULE_TPIPE_INTERFACE_IMPL_H