/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/* !
 * \file cumsum.h
 * \brief
 */
#ifndef LIB_CUMSUM_H
#define LIB_CUMSUM_H

#include "kernel_tensor.h"
#include "kernel_operator_intf.h"
#include "../../impl/math/cumsum/cumsum_common_impl.h"
#if ASCENDC_CPU_DEBUG
#include "kernel_log.h"
#endif

#if __CCE_AICORE__ >= 200

namespace AscendC {
#pragma begin_pipe(V)
template <typename T>
__aicore__ inline void CheckParamsIsValid(const LocalTensor<T> &dstTensor, const LocalTensor<T> &lastRowTensor,
    const LocalTensor<T> &srcTensor, const CumSumInfo &cumSumInfo, const bool outputLastRow)
{
    bool ans = cumSumInfo.inner > 0 && (cumSumInfo.inner * sizeof(T) % ONE_BLK_SIZE == 0);
    ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "inner is %u, which is not 32B aligned.", cumSumInfo.inner); });
    ans = srcTensor.GetSize() >= (cumSumInfo.inner * cumSumInfo.outter);
    ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "srcTensor size isn't enough!"); });
    ans = dstTensor.GetSize() >= (cumSumInfo.inner * cumSumInfo.outter);
    ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "dstTensor size isn't enough!"); });
    if (outputLastRow) {
        ans = lastRowTensor.GetSize() >= cumSumInfo.inner;
        ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "lastRowTensor size isn't enough!"); });
    }
}

constexpr CumSumConfig defaultCumSumConfig = {true, false, true};

/* !
 * \brief This function computes the cumulative sum along the last axis or the first axis.
 * For details about the interface description, see
 * https://pytorch.org/docs/stable/generated/torch.cumsum.html
 *
 * \note supported data types: half and float
 *
 * \param [out] dstTensor, output LocalTensor
 * \param [out] lastRowTensor, the last row of the output LocalTensor
 * \param [in] srcTensor, input LocalTensor
 * \param [in] sharedTmpBuffer, input local temporary Tensor
 * \param [in] cumSumInfo, shape information of srcTensor
 */
template <typename T, const CumSumConfig &config = defaultCumSumConfig>
__aicore__ inline void CumSum(LocalTensor<T> &dstTensor, LocalTensor<T> &lastRowTensor,
    const LocalTensor<T> &srcTensor, LocalTensor<uint8_t> &sharedTmpBuffer, const CumSumInfo &cumSumInfo)
{
    if ASCEND_IS_AIC {
        return;
    }
#if ASCENDC_CPU_DEBUG
    CheckParamsIsValid(dstTensor, lastRowTensor, srcTensor, cumSumInfo, config.outputLastRow);
#endif
    if constexpr (config.isLastAxis) {
        uint32_t minCastTempBufferSize = 0;
        if constexpr (sizeof(T) == 2) { // 2 is for half
            minCastTempBufferSize = cumSumInfo.inner * NCHW_CONV_ADDR_LIST_SIZE * sizeof(half);
        }
        const uint32_t minTmpBufferSize =
            minCastTempBufferSize + NCHW_CONV_ADDR_LIST_SIZE * cumSumInfo.inner * sizeof(T) * 2; // 2: both transposes need the temp buffer
        const uint32_t tmpBufferSize = sharedTmpBuffer.GetSize();
#if ASCENDC_CPU_DEBUG
        ASCENDC_ASSERT((tmpBufferSize >= minTmpBufferSize), {
            KERNEL_LOG(KERNEL_ERROR,
                "tmpBufferSize can't be smaller than minTmpBufferSize, tmpBufferSize is %u, minTmpBufferSize is %u!",
                tmpBufferSize, minTmpBufferSize);
        });
#endif
        // Loop over outter; each iteration handles at least 16 rows of data.
        const uint32_t oneRepeatSize = tmpBufferSize / minTmpBufferSize * NCHW_CONV_ADDR_LIST_SIZE;
        const uint32_t rangeM = cumSumInfo.outter / oneRepeatSize;
        const uint32_t tailM = cumSumInfo.outter - oneRepeatSize * rangeM;
        uint32_t dstLocalOffset = 0;
        uint32_t srcLocalOffset = 0;
        LocalTensor<T> tmpBuffer = sharedTmpBuffer.ReinterpretCast<T>();
        for (uint32_t i = 0; i < rangeM; i++) {
            CumSumLastDim<T>(dstTensor[dstLocalOffset], srcTensor[srcLocalOffset], tmpBuffer,
                {oneRepeatSize, cumSumInfo.inner});
            dstLocalOffset += cumSumInfo.inner * oneRepeatSize;
            srcLocalOffset += cumSumInfo.inner * oneRepeatSize;
        }
        if (tailM != 0) {
            CumSumLastDim<T>(dstTensor[dstLocalOffset], srcTensor[srcLocalOffset], tmpBuffer,
                {tailM, cumSumInfo.inner});
        }
    } else {
        CumSumFirstDim<T>(dstTensor, srcTensor, sharedTmpBuffer, cumSumInfo);
    }
    if constexpr (config.outputLastRow) {
        SetMaskCount();
        SetVectorMask<T, MaskMode::COUNTER>(0, cumSumInfo.inner);
        Adds<T, false>(lastRowTensor, dstTensor[(cumSumInfo.outter - 1) * cumSumInfo.inner], static_cast<T>(0),
            MASK_PLACEHOLDER, 1, {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE});
        PipeBarrier<PIPE_V>();
        SetMaskNorm();
        ResetMask();
    }
}
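
/* !
 * Usage sketch (illustrative only, not part of the shipped interface): the
 * tensor setup is assumed to come from the caller's own TPipe/TQue management,
 * and the shapes are examples. With T = float, the assertion above requires
 * sharedTmpBuffer to hold at least
 * NCHW_CONV_ADDR_LIST_SIZE * inner * sizeof(float) * 2 bytes.
 *
 *   LocalTensor<float> dst;            // hypothetical, outter * inner elements
 *   LocalTensor<float> lastRow;        // hypothetical, inner elements
 *   LocalTensor<float> src;            // hypothetical, outter * inner elements
 *   LocalTensor<uint8_t> tmp;          // hypothetical temporary workspace, sized as above
 *   const CumSumInfo info{16, 64};     // {outter, inner}; inner * sizeof(T) must be 32B aligned
 *   CumSum<float, defaultCumSumConfig>(dst, lastRow, src, tmp, info);
 *   // dst now holds the row-wise cumulative sums; lastRow holds the last row of dst.
 *
 * The overload below omits sharedTmpBuffer and obtains the workspace from the
 * stack buffer instead.
 */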

/* !
 * \brief This function computes the cumulative sum along the last axis or the first axis.
 * For details about the interface description, see
 * https://pytorch.org/docs/stable/generated/torch.cumsum.html
 *
 * \note supported data types: half and float
 *
 * \param [out] dstTensor, output LocalTensor
 * \param [out] lastRowTensor, the last row of the output LocalTensor
 * \param [in] srcTensor, input LocalTensor
 * \param [in] cumSumInfo, shape information of srcTensor
 */
template <typename T, const CumSumConfig &config = defaultCumSumConfig>
__aicore__ inline void CumSum(LocalTensor<T> &dstTensor, LocalTensor<T> &lastRowTensor,
    const LocalTensor<T> &srcTensor, const CumSumInfo &cumSumInfo)
{
    if ASCEND_IS_AIC {
        return;
    }
    LocalTensor<uint8_t> sharedTmpBuffer;
    bool ans = PopStackBuffer<uint8_t, TPosition::LCM>(sharedTmpBuffer);
    ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); });
    CumSum<T, config>(dstTensor, lastRowTensor, srcTensor, sharedTmpBuffer, cumSumInfo);
}
#pragma end_pipe
} // namespace AscendC

#endif
#endif // LIB_CUMSUM_H