/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file mean.h
 * \brief
 */
#ifndef LIB_MEAN_MEAN_H
#define LIB_MEAN_MEAN_H
#include "kernel_tensor.h"
#include "kernel_operator_intf.h"
#include "kernel_pop_stack_buffer.h"
#if ASCENDC_CPU_DEBUG
#include "kernel_log.h"
#include <type_traits>
#include <cmath> // assumption: the original angle-bracket header name was lost; <type_traits> above is needed for std::is_same
#endif

#if __CCE_AICORE__ >= 200

namespace AscendC {
#pragma begin_pipe(PIPE_V)
constexpr uint32_t HALF_NUM_PER = 128;  // half elements processed per vector repeat
constexpr uint32_t FLOAT_NUM_PER = 64;  // float elements processed per vector repeat

struct MeanParams {
    uint32_t outter = 1;
    uint32_t inner;  // last-axis length padded to 32-byte alignment: inner = (n * sizeof(T) + 32 - 1) / 32 * 32 / sizeof(T)
    uint32_t n;      // actual number of valid elements in the last axis
};
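// Example (illustrative only, not part of the original interface): filling MeanParams for a
// [16, 100] half tensor whose rows are padded to 32-byte alignment. Using the formula above:
// inner = (100 * sizeof(half) + 31) / 32 * 32 / sizeof(half) = 224 / 2 = 112.
//
//   MeanParams meanParams;
//   meanParams.outter = 16;   // number of rows reduced independently
//   meanParams.n = 100;       // valid elements per row
//   meanParams.inner = 112;   // padded row length, 32-byte aligned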
template <typename T>
__aicore__ inline void CheckParamsIsValid(const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor,
    const LocalTensor<uint8_t>& sharedTmpBuffer, const MeanParams& meanParams, uint32_t tmpBufferSize)
{
#if ASCENDC_CPU_DEBUG
    bool ans = meanParams.outter != 0 && meanParams.inner != 0 && (meanParams.inner * sizeof(T) % ONE_BLK_SIZE == 0);
    ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "outter and inner can't be zero, inner must be 32B aligned"); });
    ans = ((meanParams.n >= 1) && (meanParams.n <= meanParams.inner));
    ASCENDC_ASSERT(
        ans, { KERNEL_LOG(KERNEL_ERROR, "n must be greater than or equal to 1 and less than or equal to inner"); });
    ans = srcTensor.GetSize() >= meanParams.outter * meanParams.inner;
    ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "srcTensor size isn't enough!"); });
    ans = dstTensor.GetSize() * sizeof(T) >=
        (meanParams.outter * sizeof(T) + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE;
    ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "dstTensor size isn't enough!"); });
    ans = sharedTmpBuffer.GetSize() >= tmpBufferSize;
    ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "sharedTmpBuffer size isn't enough!"); });
#endif
}

__aicore__ inline void MeanCast(const LocalTensor<half>& dstTensor, const LocalTensor<half>& srcTensor,
    const LocalTensor<uint8_t>& sharedTmpBuffer, const MeanParams& meanParams)
{
    uint32_t elementNumPerRep = FLOAT_NUM_PER;
    uint32_t repeateTimes = (meanParams.n + elementNumPerRep - 1) / elementNumPerRep;
    uint32_t finalWorkSize =
        meanParams.inner * sizeof(float) + (repeateTimes + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE;
#if ASCENDC_CPU_DEBUG
    CheckParamsIsValid(dstTensor, srcTensor, sharedTmpBuffer, meanParams, finalWorkSize);
#endif
    const UnaryRepeatParams unaryParams;
    float scalarValue = static_cast<float>(1) / static_cast<float>(static_cast<int32_t>(meanParams.n));
    LocalTensor<float> TmpTensor = sharedTmpBuffer.ReinterpretCast<float>();
    LocalTensor<half> castTensor = sharedTmpBuffer.ReinterpretCast<half>();
    SetMaskCount();
    for (uint32_t row = 0; row < meanParams.outter; ++row) {
        SetVectorMask<float, MaskMode::COUNTER>(0, meanParams.n);
        // Cast the half input row to float so the accumulation is done in float.
        Cast<float, half, false>(TmpTensor, srcTensor[row * meanParams.inner], RoundMode::CAST_NONE, MASK_PLACEHOLDER,
            1, {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE});
        PipeBarrier<PIPE_V>();
        RepeatReduceSum<float, false>(TmpTensor[meanParams.inner], TmpTensor, 1, MASK_PLACEHOLDER, DEFAULT_BLK_STRIDE,
            DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE);
        PipeBarrier<PIPE_V>();
        uint32_t reduceNums = repeateTimes;
        // Keep reducing the per-repeat partial sums until a single sum per row remains.
        while (reduceNums > 1) {
            SetVectorMask<float, MaskMode::COUNTER>(0, reduceNums);
            reduceNums = (reduceNums + elementNumPerRep - 1) / elementNumPerRep;
            RepeatReduceSum<float, false>(TmpTensor[meanParams.inner], TmpTensor[meanParams.inner], 1,
                MASK_PLACEHOLDER, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE);
            PipeBarrier<PIPE_V>();
        }
        SetVectorMask<float, MaskMode::COUNTER>(0, 1);
        // Scale the row sum by 1 / n, then cast the result back to half.
        Muls<float, false>(TmpTensor[meanParams.inner], TmpTensor[meanParams.inner], scalarValue, MASK_PLACEHOLDER, 1,
            unaryParams);
        PipeBarrier<PIPE_V>();
        Cast<half, float, false>(castTensor, TmpTensor[meanParams.inner], RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1,
            {1, 1, HALF_DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE});
        PipeBarrier<PIPE_V>();
        RepeatReduceSum<half, false>(dstTensor[row], castTensor, 1, MASK_PLACEHOLDER, DEFAULT_BLK_STRIDE,
            DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE);
    }
    SetMaskNorm();
    ResetMask();
}

template <typename T>
__aicore__ inline void MeanForOneRepeatTime(const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor,
    const MeanParams& meanParams, T scalarValue)
{
    SetVectorMask<T, MaskMode::COUNTER>(0, meanParams.n);
    for (uint32_t row = 0; row < meanParams.outter; ++row) {
        RepeatReduceSum<T, false>(dstTensor[row], srcTensor[row * meanParams.inner], 1, MASK_PLACEHOLDER,
            DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE);
    }
    PipeBarrier<PIPE_V>();
    SetVectorMask<T, MaskMode::COUNTER>(0, meanParams.outter);
    const UnaryRepeatParams unaryParams;
    Muls<T, false>(dstTensor, dstTensor, scalarValue, MASK_PLACEHOLDER, 1, unaryParams);
    SetMaskNorm();
    ResetMask();
}

template <typename T>
__aicore__ inline void MeanCommon(const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor,
    const LocalTensor<uint8_t>& sharedTmpBuffer, const MeanParams& meanParams)
{
    uint32_t elementNumPerRep = FLOAT_NUM_PER;
    if constexpr (sizeof(T) == sizeof(half)) {
        elementNumPerRep = HALF_NUM_PER;
    }
    uint32_t repeateTimes = (meanParams.n + elementNumPerRep - 1) / elementNumPerRep;
    uint32_t finalWorkSize = (repeateTimes + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE;
#if ASCENDC_CPU_DEBUG
    CheckParamsIsValid(dstTensor, srcTensor, sharedTmpBuffer, meanParams, finalWorkSize);
#endif
    T scalarValue = static_cast<T>(static_cast<float>(1) / static_cast<float>(static_cast<int32_t>(meanParams.n)));
    SetMaskCount();
    if (repeateTimes == 1) {
        return MeanForOneRepeatTime(dstTensor, srcTensor, meanParams, scalarValue);
    }
    const UnaryRepeatParams unaryParams;
    LocalTensor<T> TmpTensor = sharedTmpBuffer.ReinterpretCast<T>();
    for (uint32_t row = 0; row < meanParams.outter; ++row) {
        uint32_t reduceNums = repeateTimes;
        SetVectorMask<T, MaskMode::COUNTER>(0, meanParams.n);
        RepeatReduceSum<T, false>(TmpTensor, srcTensor[row * meanParams.inner], 1, MASK_PLACEHOLDER,
            DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE);
        PipeBarrier<PIPE_V>();
        // Keep reducing the per-repeat partial sums; the final pass writes directly to dstTensor[row].
        while (reduceNums > 1) {
            SetVectorMask<T, MaskMode::COUNTER>(0, reduceNums);
            reduceNums = (reduceNums + elementNumPerRep - 1) / elementNumPerRep;
            if (reduceNums == 1) {
                RepeatReduceSum<T, false>(dstTensor[row], TmpTensor, 1, MASK_PLACEHOLDER, DEFAULT_BLK_STRIDE,
                    DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE);
            } else {
                RepeatReduceSum<T, false>(TmpTensor, TmpTensor, 1, MASK_PLACEHOLDER, DEFAULT_BLK_STRIDE,
                    DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE);
            }
            PipeBarrier<PIPE_V>();
        }
    }
    SetVectorMask<T, MaskMode::COUNTER>(0, meanParams.outter);
    Muls<T, false>(dstTensor, dstTensor, scalarValue, MASK_PLACEHOLDER, 1, unaryParams);
    SetMaskNorm();
}

/*!
 * \brief This function calculates the mean along the last axis.
 * For details about the interface description, see
 * https://pytorch.org/docs/stable/generated/torch.mean.html
 *
 * \note supported data types: half and float
 *
 * \param [out] dstTensor, output LocalTensor
 * \param [in] srcTensor, input LocalTensor
 * \param [in] sharedTmpBuffer, input local temporary Tensor
 * \param [in] meanParams, shape information of srcTensor
 */
template <typename T, typename accType, bool isReuseSource = false>
__aicore__ inline void Mean(const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor,
    const LocalTensor<uint8_t>& sharedTmpBuffer, const MeanParams& meanParams)
{
    if ASCEND_IS_AIC {
        return;
    }
    ASCENDC_ASSERT(((std::is_same<T, half>::value && std::is_same<accType, half>::value) ||
        (std::is_same<T, float>::value && std::is_same<accType, float>::value) ||
        (std::is_same<T, half>::value && std::is_same<accType, float>::value)),
        { KERNEL_LOG(KERNEL_ERROR, "Two conditions are supported: "
            "1. T is half or float, and accType is the same as T; "
            "2. T is half and accType is float."); });
    if constexpr (sizeof(T) == sizeof(half) && sizeof(accType) == sizeof(float)) {
        // half input accumulated in float for better precision
        MeanCast(dstTensor, srcTensor, sharedTmpBuffer, meanParams);
    } else {
        MeanCommon(dstTensor, srcTensor, sharedTmpBuffer, meanParams);
    }
}
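// Usage sketch (illustrative only; the tensor names dstLocal, srcLocal and sharedTmpBuffer are
// assumptions about the surrounding kernel, not part of this header): row-wise mean of an
// [8, 64] half tensor with float accumulation inside a vector kernel.
//
//   MeanParams params{8, 64, 64};  // outter = 8, inner = 64 (64 * sizeof(half) is already 32B aligned), n = 64
//   Mean<half, float>(dstLocal, srcLocal, sharedTmpBuffer, params);
//   // dstLocal[0..7] now holds one half-precision mean per row.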
/*!
 * \brief This function calculates the mean along the last axis.
 * For details about the interface description, see
 * https://pytorch.org/docs/stable/generated/torch.mean.html
 *
 * \note supported data types: half and float
 *
 * \param [out] dstTensor, output LocalTensor
 * \param [in] srcTensor, input LocalTensor
 * \param [in] meanParams, shape information of srcTensor
 */
template <typename T, typename accType, bool isReuseSource = false>
__aicore__ inline void Mean(
    const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor, const MeanParams& meanParams)
{
    if ASCEND_IS_AIC {
        return;
    }
    // Use the framework-managed stack buffer as the temporary workspace.
    LocalTensor<uint8_t> sharedTmpBuffer;
    bool ans = PopStackBuffer<uint8_t, TPosition::LCM>(sharedTmpBuffer);
    ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); });
    Mean<T, accType, isReuseSource>(dstTensor, srcTensor, sharedTmpBuffer, meanParams);
}
#pragma end_pipe
} // namespace AscendC
#endif
#endif // LIB_MEAN_MEAN_H