/**
 * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* !
 * \file kernel_utils.h
 * \brief
 */
// NOTE(review): this file appears to have lost the contents of several angle-bracketed
// tokens during extraction (template parameter lists and <...> include targets). The
// affected spots are flagged below and must be restored from the upstream source.
#ifndef ASCENDC_MODULE_UTILS_H
#define ASCENDC_MODULE_UTILS_H

#define USE_ISA_INS 1
// Convenience alias for a global-memory byte pointer used in kernel entry signatures.
#define GM_ADDR __gm__ uint8_t*

// Branch-prediction hints.
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

#include "kernel_macros.h"
#include "kernel_log.h"
#include "kernel_event.h"

#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
// CPU-side kernel test build: pull in host stubs.
// NOTE(review): the five system-header names below were lost in extraction — restore them.
#include
#include
#include
#include
#include
#include "stub_def.h"
#include "stub_fun.h"
#endif // __CCE_KT_TEST__

// This macro is used to define a new array with dim.
#define ASCENDC_SHAPE(dimValue, ...) \
    dimValue, (const uint32_t[]) \
    { \
        __VA_ARGS__ \
    }

// define macro for deterministic compile options
// Kernel dispatch meta types recorded in the binary's metadata.
enum KernelMetaType : uint8_t {
    KERNEL_TYPE_AIV_ONLY,
    KERNEL_TYPE_AIC_ONLY,
    KERNEL_TYPE_MIX_AIV_1_0,
    KERNEL_TYPE_MIX_AIC_1_0,
    KERNEL_TYPE_MIX_AIC_1_1,
    KERNEL_TYPE_MIX_AIC_1_2,
    KERNEL_TYPE_AICORE,
    KERNEL_TYPE_VECTORCORE,
    KERNEL_TYPE_MIX_AICORE,
    KERNEL_TYPE_MIX_VECTOR_CORE,
    KERNEL_TYPE_MAX,
};

enum KernelType {
    K_TYPE_AICORE = 1,       // c100/m200
    K_TYPE_AIC = 2,          // v220-cube
    K_TYPE_AIV = 3,          // v220-vec
    K_TYPE_MIX_AIC_MAIN = 4, // v220 mix cube/vector 1:2
    K_TYPE_MIX_AIV_MAIN = 5, // v220 mix vector/cube 1:2
    K_TYPE_AIC_ROLLBACK = 6, // v220-cube,aic rollback
    K_TYPE_AIV_ROLLBACK = 7, // v220-vec,aiv rollback
    K_TYPE_MAX
};

struct BaseTlv { // TLV header definition
    unsigned short type;
    unsigned short len;
};

enum FuncMetaType { // function-level TLV types
    F_TYPE_KTYPE = 1,           // kernel type tlv
    F_TYPE_CROSS_CORE_SYNC = 2, // cross core sync
    F_TYPE_MIX_TASK_RATION = 3, // MIX CORE TYPE
    F_TYPE_MAX
};

enum CrossCoreSyncType { // function-level TLV types
    C_TYPE_USE_SYNC = 1, // use cross core sync
    C_TYPE_MAX
};

struct OpSystemRunCfg {
    uint64_t l2Cacheoffset;
};

namespace AscendC {
enum class CacheMode {
    CACHE_MODE_DISABLE = 0,
    CACHE_MODE_NORMAL = 1,
    CACHE_MODE_LAST = 2,
    CACHE_MODE_PERSISTENT = 4
};

enum class CacheRwMode {
    READ = 1,
    WRITE = 2,
    RW = 3
};

#if __CCE_AICORE__ == 200 || __CCE_AICORE__ == 220
// Per-kernel system configuration; currently only carries the L2 cache bypass offset.
struct KernelSystemCfg {
    uint64_t l2CacheOffset;
public:
    __aicore__ void SetL2CacheOffset(uint64_t offset)
    {
        l2CacheOffset = offset;
    }
    // Rebases a GM address by l2CacheOffset when L2 caching is disabled;
    // otherwise returns the address unchanged.
    // NOTE(review): the template parameter list and the inner cast's target type were
    // lost in extraction (likely `template <typename T>` / `reinterpret_cast<uint64_t>`).
    template __aicore__ __inline__ __gm__ T* L2CacheAlter(__gm__ T* addr, CacheMode mode) const
    {
        if (mode == CacheMode::CACHE_MODE_DISABLE) {
            return reinterpret_cast<__gm__ T*>(reinterpret_cast(addr) + l2CacheOffset);
        } else {
            return addr;
        }
    }
};
__BLOCK_LOCAL__ __inline__ KernelSystemCfg g_kernelSystemCfg;
__aicore__ inline KernelSystemCfg* GetKernelSystemCfg()
{
    return &g_kernelSystemCfg;
}
#endif // __CCE_AICORE__ == 200 || __CCE_AICORE__ == 220

// Free-function form of L2CacheAlter; the definition continues below.
template __aicore__ __inline__ __gm__ T*
L2CacheAlter(__gm__ T* addr, CacheMode mode)
{
#if __CCE_AICORE__ == 200 || __CCE_AICORE__ == 220
    return GetKernelSystemCfg()->L2CacheAlter(addr, mode);
#else
    // Other architectures have no per-kernel L2 offset; address passes through.
    return addr;
#endif
}
} // namespace AscendC

// Function-level metadata TLV payloads emitted into the kernel binary.
struct FunMetaKType {
    BaseTlv head;
    unsigned int ktype;
};
struct FunMetaCrossCoreType {
    BaseTlv head;
    unsigned int usedCrossCoreSync;
};
struct FunMetaMixCoreType {
    BaseTlv head;
    unsigned short taskRation0;
    unsigned short taskRation1;
};
struct FunLevelKType {
    struct FunMetaKType ktypeMeta;
};
struct FunLevelCrossCoreType {
    struct FunMetaKType ktypeMeta;
    struct FunMetaCrossCoreType crossCoreType;
};
struct FunLevelMixCoreType {
    struct FunMetaKType ktypeMeta;
    struct FunMetaMixCoreType mixCoreType;
};

// At precompile time these expand to marker variables the toolchain scans for;
// in normal builds they expand to nothing.
#ifdef __CHECK_FEATURE_AT_PRECOMPILE
#define ENABLE_FEATURE_FOR_COMPILE(f, val) auto __enable_feature_for_compile_##f = val
#else
#define ENABLE_FEATURE_FOR_COMPILE(f, val)
#endif
#define ENABLE_DETERMINISTIC() ENABLE_FEATURE_FOR_COMPILE(deterministic, 1)
#define KERNEL_TASK_TYPE(key, value) ENABLE_FEATURE_FOR_COMPILE(key, value)
#define KERNEL_TASK_TYPE_DEFAULT(value) ENABLE_FEATURE_FOR_COMPILE(default, value)
#define ENABLE_PRINTF() ENABLE_FEATURE_FOR_COMPILE(printf, 1)
#define ENABLE_PRINTF_DUMP_SIZE() ENABLE_FEATURE_FOR_COMPILE(printfBufSize, 1048576)
#define ENABLE_ASSERT() ENABLE_FEATURE_FOR_COMPILE(assert, 1)
#define ENABLE_ASSERT_DUMP_SIZE() ENABLE_FEATURE_FOR_COMPILE(assertBufSize, 1024)

#ifndef ONE_CORE_DUMP_SIZE
#define ONE_CORE_DUMP_SIZE (1024 * 1024)
#endif

namespace AscendC {
// Core-type discriminator values used by g_coreType below.
constexpr int32_t MIX = 0;
constexpr int32_t AIC = 1;
constexpr int32_t AIV = 2;
constexpr size_t DUMP_UINTSIZE = ONE_CORE_DUMP_SIZE;
} // namespace AscendC

#if defined(__CCE_KT_TEST__)
// CPU test build: core type is a runtime, per-thread value.
extern thread_local int32_t g_coreType;
#define ASCEND_IS_AIV (g_coreType == AIV)
#define ASCEND_IS_AIC (g_coreType == AIC)
#define ASCEND_IS_NOT_AIV (g_coreType != AIV)
#define ASCEND_IS_NOT_AIC (g_coreType != AIC)
#else
// Device build: core type is fixed by the target architecture macro.
#if defined(__DAV_C220_CUBE__)
constexpr int32_t g_coreType = AscendC::AIC;
#elif defined(__DAV_C220_VEC__)
constexpr int32_t g_coreType = AscendC::AIV;
#else
constexpr int32_t g_coreType = AscendC::MIX;
#endif
// These expand so that `if ASCEND_IS_AIV { ... }` becomes `if constexpr (...)`,
// compiling the branch out on the wrong core type.
#define ASCEND_IS_AIV constexpr(g_coreType == AIV)
#define ASCEND_IS_AIC constexpr(g_coreType == AIC)
#define ASCEND_IS_NOT_AIV constexpr(g_coreType != AIV)
#define ASCEND_IS_NOT_AIC constexpr(g_coreType != AIC)
#endif

// NOTE(review): include target lost in extraction — restore from upstream.
#include

#ifndef TILING_KEY_VAR
#if defined(__CCE_KT_TEST__)
extern thread_local uint64_t g_tilingKey;
#else
#if __CCE_AICORE__ == 200
[[block_local]] uint64_t g_tilingKey;
#else
[[workgroup_local]] __gm__ uint64_t g_tilingKey;
#endif
#endif
#define TILING_KEY_VAR g_tilingKey
#endif
// True when the kernel was launched with tiling key k.
#define TILING_KEY_IS(k) (TILING_KEY_VAR == (k))

#if defined(ASCENDC_OOM) && ASCENDC_OOM == 1
// Global-memory out-of-bounds checking: record up to 128 valid address ranges.
constexpr bool g_gm_overflow_check = true;
constexpr uint64_t g_oomAddrRangeMaxSize = 128;
struct OomAddrRange {
    uintptr_t addr[g_oomAddrRangeMaxSize];
    uint64_t len[g_oomAddrRangeMaxSize];
    uint64_t count;
};
__BLOCK_LOCAL__ __inline__ OomAddrRange g_oomAddrArange;
#else
constexpr bool g_gm_overflow_check = false;
#endif

namespace AscendC {
#ifdef __CCE_KT_TEST__
// No instruction prefetch on the CPU test build.
#define PRELOAD(len) \
    {}
#else
// Prefetch `len` bytes of instructions starting at the current program counter.
#define PRELOAD(len) \
    do { \
        uint64_t pc; \
        asm volatile("mov %0, pc \n" : "=l"(pc) : :); \
        preload((void*)pc, len); \
    } while (0)
#endif

// Integer ceiling division; b must be non-zero.
__aicore__ inline uint32_t DivCeil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}
// Rounds a up to the nearest multiple of b.
__aicore__ inline uint32_t AlignUp(uint32_t a, uint32_t b)
{
    return DivCeil(a, b) * b;
}
// constexpr variant of DivCeil.
__aicore__ constexpr inline uint32_t ConstCeil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}
// Same computation as DivCeil; kept as a separate name for interface compatibility.
__aicore__ inline uint32_t Ceil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

// Static configuration for a TQue buffer queue.
struct TQueConfig {
    bool nd2nz = false;
    bool nz2nd = false;
    bool scmBlockGroup = false;
    uint32_t bufferLen = 0;
    uint32_t bufferNumber = 0;
    uint32_t consumerSize = 0;
    TPosition consumer[8] = {};
};

// Builds a TQueConfig from explicit fields; the definition continues below.
__aicore__ constexpr TQueConfig GetTQueConfig(bool nd2nzIn, bool nz2ndIn, bool scmBlockGroupIn,
    uint32_t bufferLenIn, uint32_t bufferNumberIn, uint32_t consumerSizeIn, const
TPosition consumerIn[])
{
    return {
        .nd2nz = nd2nzIn,
        .nz2nd = nz2ndIn,
        .scmBlockGroup = scmBlockGroupIn,
        .bufferLen = bufferLenIn,
        .bufferNumber = bufferNumberIn,
        .consumerSize = consumerSizeIn,
        .consumer = {consumerIn[0], consumerIn[1], consumerIn[2], consumerIn[3], consumerIn[4], consumerIn[5],
            consumerIn[6], consumerIn[7]}
    };
}

// Decodes a TQueConfig from a legacy bit mask (bit0: nd2nz, bit1: nz2nd, bit2: scmBlockGroup).
// NOTE(review): the static_cast target types were lost in extraction.
__aicore__ constexpr TQueConfig GetTQueConfig(const int32_t mask)
{
    return {
        .nd2nz = static_cast(static_cast(mask) & 0x1u),
        .nz2nd = static_cast((static_cast(mask) & 0x2u) >> 1),
        .scmBlockGroup = static_cast((static_cast(mask) & 0x4u) >> 2),
        .bufferLen = 0,
        .bufferNumber = 0,
        .consumerSize = 0,
        .consumer = {TPosition::MAX, TPosition::MAX, TPosition::MAX, TPosition::MAX, TPosition::MAX, TPosition::MAX,
            TPosition::MAX, TPosition::MAX}
    };
}

// Copies a TQueConfig from a pointer (materializes a constexpr value from a stored config).
__aicore__ constexpr TQueConfig GetTQueConfig(const TQueConfig* conf)
{
    return {
        .nd2nz = conf->nd2nz,
        .nz2nd = conf->nz2nd,
        .scmBlockGroup = conf->scmBlockGroup,
        .bufferLen = conf->bufferLen,
        .bufferNumber = conf->bufferNumber,
        .consumerSize = conf->consumerSize,
        .consumer = {conf->consumer[0], conf->consumer[1], conf->consumer[2], conf->consumer[3], conf->consumer[4],
            conf->consumer[5], conf->consumer[6], conf->consumer[7]}
    };
}

// Minimal compile-time bool trait machinery.
// NOTE(review): template parameter lists were lost in extraction
// (upstream: `template <bool b> struct BoolInst`, `template <typename, typename> struct IsSameType`, etc.).
template struct BoolInst {
    using Type = BoolInst;
    static constexpr bool value = b;
};
using TrueType = BoolInst;
using FalseType = BoolInst;
template struct IsSameType : public FalseType {};
template struct IsSameType : public TrueType {};

// Block/repeat geometry defaults (32B blocks, 256B repeats unless noted).
const int32_t DEFAULT_BLK_NUM = 8;
const int32_t POWER_MASK_NUM = 8;
const int32_t HALF_FACTOR = 2;
const int32_t DEFAULT_BLK_STRIDE = 1;
const uint8_t DEFAULT_REPEAT_STRIDE = 8;
const uint8_t HALF_DEFAULT_REPEAT_STRIDE = 4;
const uint8_t ONE_FOURTH_DEFAULT_REPEAT_STRIDE = 2;
const uint64_t FULL_MASK = 0xffffffffffffffff;
const uint64_t CONST_MASK_VALUE = 0x8000000000000000;
const uint16_t MAX_HALF_MASK_LEN = 64;
const int32_t DEFAULT_C0_SIZE = 32;
const int32_t DEFAULT_BLOCK_SIZE = 256;
const int32_t MAX_REPEAT_TIMES = 255;
const int32_t MIN_REPEAT_TIMES = 0;
const bool DEFAULT_REPEAT_STRIDE_MODE = 0;
const bool STRIDE_SIZE_MODE = 0;
const int32_t ONE_BYTE_BIT_SIZE = 8;
const int32_t ONE_DUMP_BACKUP_SIZE = 1024;
const int32_t DUMP_UB_SIZE = 256;
const int32_t DUMP_EXC_FLAG = 7;
const uint32_t TOTAL_L0A_SIZE = 64 * 1024;
const uint32_t TOTAL_L0B_SIZE = 64 * 1024;
const uint32_t TMP_UB_SIZE = 8 * 1024;
const uint32_t MAX_SLICE_SIZE = 6 * 256;
// IEEE-754 float32 bit patterns.
const uint32_t F32_INF = 0x7f800000;
const uint32_t F32_NEG_INF = 0xff800000;
const uint32_t F32_NAN = 0x7fc00000;
// BlockInfo Pos
const uint32_t BLOCK_INFO_LEN_POS = 0;
const uint32_t BLOCK_INFO_CORE_POS = 1;
const uint32_t BLOCK_INFO_BLOCKNUM_POS = 2;
const uint32_t BLOCK_INFO_DUMPOFFSET_POS = 3;
const uint32_t BLOCK_INFO_MAGIC_POS = 4;
const uint32_t BLOCK_INFO_RSV_POS = 5;
const uint32_t BLOCK_INFO_DUMP_ADDR = 6;
const uint32_t BLOCK_INFO_MAGIC_NUM = 0x5aa5bccd;
// DUMP_META positions, measured in units of uint8_t
const uint32_t DUMP_META_TYPE_POS = 0;
const uint32_t DUMP_META_LEN_POS = 4;
const uint16_t DUMP_META_BLOCK_DIM_POS = 8;
const uint8_t DUMP_META_CORE_TYPE_POS = 10;
const uint8_t DUMP_META_TASK_RATION = 11;
const uint32_t DUMP_META_RSV_POS = 12;
// DumpMessageHead Pos
const uint32_t DUMP_MESSAGE_HEAD_TYPE_POS = 0;
const uint32_t DUMP_MESSAGE_HEAD_LEN_POS = 1;
const uint32_t DUMP_MESSAGE_HEAD_ADDR_POS = 2;
const uint32_t DUMP_MESSAGE_HEAD_DATA_TYPE_POS = 3;
const uint32_t DUMP_MESSAGE_HEAD_DESC_POS = 4;
const uint32_t DUMP_MESSAGE_HEAD_BUFFERID_POS = 5;
const uint32_t DUMP_MESSAGE_HEAD_POSITION_POS = 6;
const uint32_t DUMP_MESSAGE_HEAD_RSV_POS = 7;
const uint32_t DUMP_SCALAR_POS = 8;
const uint32_t DUMP_CORE_COUNT = 75;
const uint32_t DUMP_WORKSPACE_SIZE = DUMP_CORE_COUNT * ONE_CORE_DUMP_SIZE;
// DumpShapeMessageHead Pos
const uint32_t DUMP_SHAPE_MESSAGE_HEAD_TYPE_POS = 0;
const uint32_t DUMP_SHAPE_MESSAGE_HEAD_LEN_POS = 1;
const uint32_t DUMP_SHAPE_MESSAGE_HEAD_DIM_POS = 2;
const uint32_t DUMP_SHAPE_MESSAGE_HEAD_SHAPE_START_POS = 3;
const uint32_t
DUMP_SHAPE_MESSAGE_HEAD_RSV_POS = 11;
const uint32_t DUMP_SHAPE_MESSAGE_TL_LEN = 8;
// Ctrl bit Pos
constexpr int32_t CTRL_46_BIT = 46;
constexpr int32_t CTRL_47_BIT = 47;
constexpr int32_t CTRL_48_BIT = 48;
constexpr int32_t CTRL_53_BIT = 53;
// power param
constexpr uint32_t TENSOR_TENSOR_FLOAT_POWER_FACTOR = 4;
constexpr uint32_t TENSOR_TENSOR_INT_POWER_FACTOR = 6;
constexpr uint32_t TENSOR_TENSOR_HALF_POWER_FACTOR = 7;
constexpr uint32_t TENSOR_SCALAR_FLOAT_POWER_FACTOR = 5;
constexpr uint32_t TENSOR_SCALAR_INT_POWER_FACTOR = 7;
constexpr uint32_t TENSOR_SCALAR_HALF_POWER_FACTOR = 7;
constexpr uint32_t POWER_TWO = 2;
constexpr uint32_t POWER_THREE = 3;
constexpr uint32_t POWER_INT32_BITS = 32;
// int4b_t param
constexpr uint32_t INT4_TWO = 2;
constexpr uint32_t INT4_BIT_NUM = 4;
// AddDeqRelu param
constexpr int32_t DEQ_SHIFT_LEFT_17_BIT = 131072;
constexpr float DEQ_SHIFT_RIGHT_17_BIT = 1.0 / DEQ_SHIFT_LEFT_17_BIT;
constexpr int8_t ADDDEQRELU_MASK_MODE_ONE = 1;
constexpr int8_t ADDDEQRELU_MASK_MODE_TWO = 2;
// Per-architecture on-chip memory sizes.
#if (__CCE_AICORE__ <= 200)
const int32_t TOTAL_VEC_LOCAL_SIZE = 248 * 1024;
const uint32_t TOTAL_UB_SIZE = 256 * 1024;
const uint32_t TMP_UB_OFFSET = 248 * 1024;
const uint32_t TOTAL_L1_SIZE = 1024 * 1024;
const uint32_t TOTAL_L0C_SIZE = 256 * 1024;
#elif (__CCE_AICORE__ == 220)
const int32_t TOTAL_VEC_LOCAL_SIZE = 184 * 1024;
const uint32_t TOTAL_UB_SIZE = 192 * 1024;
const uint32_t TMP_UB_OFFSET = 184 * 1024;
const uint32_t TOTAL_L1_SIZE = 512 * 1024 - 128;
const uint32_t SINGLE_MSG_SIZE = 64;
const uint32_t CACHE_LINE_SIZE = 64;
const uint32_t TOTAL_L0C_SIZE = 128 * 1024;
#elif (__CCE_AICORE__ == 300)
const int32_t TOTAL_VEC_LOCAL_SIZE = 184 * 1024;
const uint32_t TOTAL_UB_SIZE = 248 * 1024;
const uint32_t TMP_UB_OFFSET = 248 * 1024;
const uint32_t TOTAL_L1_SIZE = 1024 * 1024;
const uint32_t SINGLE_MSG_SIZE = 64;
const uint32_t CACHE_LINE_SIZE = 64;
const uint32_t TOTAL_L0C_SIZE = 128 * 1024;
const uint32_t VECTOR_REG_WIDTH = 256;
const uint32_t ONE_BLOCK_SIZE = 32;
#elif (__CCE_AICORE__ == 310)
const int32_t TOTAL_VEC_LOCAL_SIZE = 184 * 1024;
const uint32_t TOTAL_UB_SIZE = 256 * 1024;
const uint32_t TMP_UB_OFFSET = 248 * 1024;
const uint32_t TOTAL_L1_SIZE = 1024 * 1024;
const uint32_t SINGLE_MSG_SIZE = 64;
const uint32_t CACHE_LINE_SIZE = 64;
const uint32_t TOTAL_L0C_SIZE = 128 * 1024;
const uint32_t VECTOR_REG_WIDTH = 256;
const uint32_t ONE_BLOCK_SIZE = 32;
#endif
const uint8_t PAD_SIZE = 4;
const uint8_t MRG_SORT_ELEMENT_LEN = 4;
const uint8_t DEFAULT_DATA_COPY_NBURST = 1;
const uint8_t DEFAULT_DATA_COPY_STRIDE = 0;
const int32_t BLOCK_CUBE = 16;
const int32_t CUBE_MAX_SIZE = 256;
const int32_t BYTE_PER_FRACTAL = 512;
const int32_t SRC_BURST_LEN_SIZE_ELE = 16;
const int32_t SRC_GAP_SIZE_BYTE = 32;
const int32_t DST_BURST_LEN_SIZE_ELE = 256;
const int32_t VREDUCE_PER_REP_OUTPUT = 2;
const uint16_t ONE_BLK_SIZE = 32;
const uint16_t ONE_PARAM_SIZE = 8;
const uint16_t AIV_CORE_NUM = 50;
const uint16_t DUMP_MSG_HEAD_SIZE = 24;
const int32_t ONE_REPEAT_BYTE_SIZE = 256;
const int32_t FULL_MASK_LEN = 128;
const int32_t HLAF_MASK_LEN = 64;
const int32_t DEFAULT_REDUCE_DST_REP_SRIDE = 1;
const uint8_t B64_BYTE_SIZE = 8;
const uint8_t B32_BYTE_SIZE = 4;
const uint8_t B16_BYTE_SIZE = 2;
const uint8_t B32_DATA_NUM_PER_BLOCK = 8;
const uint8_t B16_DATA_NUM_PER_BLOCK = 16;
const int32_t B16_DATA_NUM_PER_REPEAT = 128;
const int32_t B32_DATA_NUM_PER_REPEAT = 64;
const int32_t BLOCK_STRIDE_POS_IN_SM = 16;
const int32_t PLD_BUFFER_SIZE = 2;
const uint8_t FIXPIPE_DEQ_TENSOR_SIZE = 16;
const uint8_t SET_DATA_EXP_ZERO = 0;
const uint8_t SET_DATA_EXP_ONE = 1;
const uint8_t SET_DATA_EXP_TWO = 2;
const uint8_t SET_DATA_EXP_THREE = 3;
const uint8_t VDEQ_TENSOR_SIZE = 16;
// workspace system reserve 16MB
#if (__CCE_AICORE__ == 100)
constexpr size_t RESERVED_WORKSPACE = 2 * 1024 * 1024;
#elif (__CCE_AICORE__ == 200)
constexpr size_t RESERVED_WORKSPACE = 2 * 1024 * 1024;
#elif (__CCE_AICORE__ == 220)
constexpr size_t RESERVED_WORKSPACE = 16 * 1024 * 1024;
#elif (__CCE_AICORE__ == 300)
constexpr size_t RESERVED_WORKSPACE = 16 * 1024 * 1024;
#elif (__CCE_AICORE__ == 310)
constexpr size_t RESERVED_WORKSPACE = 16 * 1024 * 1024;
#endif
// nchwconv address list size
const int32_t NCHW_CONV_ADDR_LIST_SIZE = 16;
const int32_t VA_REG_ARRAY_LEN = 8;
const uint8_t CONV2D_IMG_SIZE = 2;
const uint8_t CONV2D_KERNEL_SIZE = 2;
const uint8_t CONV2D_STRIDE = 2;
const uint8_t CONV2D_PAD = 4;
const uint8_t CONV2D_DILATION = 2;
const int32_t K_MAX_DIM = 8;
const uint32_t TWO_OF_STACK_BUFFER = 2;
const uint32_t THREE_OF_STACK_BUFFER = 3;
const uint32_t HALF_REPEAT_SIZE = ONE_REPEAT_BYTE_SIZE / B16_BYTE_SIZE;
const uint32_t FLOAT_REPEAT_SIZE = ONE_REPEAT_BYTE_SIZE / B32_BYTE_SIZE;
const uint32_t ONE_REPEAT_FLOAT_SIZE = ONE_REPEAT_BYTE_SIZE / B32_BYTE_SIZE;
const uint32_t ONE_REPEAT_HALF_SIZE = ONE_REPEAT_BYTE_SIZE / B16_BYTE_SIZE;
const uint32_t MAX_REPEAT_FLOAT_SIZE = ONE_REPEAT_FLOAT_SIZE * MAX_REPEAT_TIMES;
const uint32_t MAX_REPEAT_HALF_SIZE = ONE_REPEAT_HALF_SIZE * MAX_REPEAT_TIMES;
const uint32_t ONE_BLK_HALF_NUM = ONE_BLK_SIZE / B16_BYTE_SIZE;
const uint32_t ONE_BLK_FLOAT_NUM = ONE_BLK_SIZE / B32_BYTE_SIZE;
const uint32_t BRCB_BROADCAST_NUMBER = 8;
const uint32_t BRCB_MAX_REPEAT_SIZE = BRCB_BROADCAST_NUMBER * MAX_REPEAT_TIMES;
const int32_t MIN_BLOCK_LEN = 1;
const uint32_t PAIR_REDUCE_REPEAT_STRIDE_LEN = 128;
const uint32_t PAIR_REDUCE_SUM_MERGES = 2;
const uint32_t TWO_HUNDRED_FIFTY_TWO_REPEAT = 252;
const uint32_t TWO_HUNDRED_FIFTY_TWO_REPEAT_BYTE_SIZE = TWO_HUNDRED_FIFTY_TWO_REPEAT * ONE_REPEAT_BYTE_SIZE;
const uint32_t REDUCEV2_MODE_SEVEN = 7;
const uint32_t DROPOUT_MODE_BYTE_MISALIGN = 1;
const uint32_t DROPOUT_MODE_BYTE_ALIGN = 2;
const uint32_t DROPOUT_MODE_BIT_ALIGN = 3;
const uint32_t DROPOUT_MODE_BIT_MISALIGN = 4;
const uint32_t REDUCEV2_MODE_ONE = 1;
const uint32_t REDUCEV2_MODE_TWO = 2;
const uint32_t REDUCEV2_MODE_THREE = 3;
// 4dTrans param size
const int32_t B8_TMP_ELE_LEN =
1024;
const int32_t B16_TMP_ELE_LEN = 256;
const int32_t B32_TMP_ELE_LEN = 128;
const int32_t B8_TRANS_LEN = 1024;
const int32_t B8_TRANS_FRACTAL = 512;
const int32_t B8_TRANS_ROW = 32;
const int32_t B8_COPY_COL = 32;
// load3dPro config
const uint64_t LOAD_M_START_POSITION = 48;
const uint64_t LOAD_K_START_POSITION = 32;
const uint64_t LOAD_M_EXTENSION = 16;
const uint64_t LOAD_DILATION_FILTER_H = 40;
const uint64_t LOAD_DILATION_FILTER_W = 32;
const uint64_t LOAD_FILTER_H = 24;
const uint64_t LOAD_FILTER_W = 16;
const uint64_t LOAD_STRIDE_H = 8;
#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
// param check size
const int32_t MAX_BLOCK_COUNT = 4095;
const int32_t MIN_BLOCK_COUNT = 1;
const int32_t MAX_BLOCK_LEN = 65535;
const int32_t MAX_16BITS_STRIDE = 65535;
const int32_t MAX_8BITS_STRIDE = 255;
const int32_t MIN_BLOCK_NUM = 1;
const int32_t MAX_PROPOSAL_MODE_NUM = 5;
const int32_t MIN_PROPOSAL_MODE_NUM = 0;
// load2d param size
const int32_t MAX_LOAD2D_START_INDEX = 65535;
const int32_t MIN_LOAD2D_START_INDEX = 0;
const int32_t MAX_LOAD2D_SID = 15;
const int32_t MIN_LOAD2D_SID = 0;
// load3dv1 param size
const int32_t MAX_LOAD3D_PAD = 255;
const int32_t MIN_LOAD3D_PAD = 0;
const int32_t MAX_LOAD3D_L1 = 32767;
const int32_t MIN_LOAD3D_L1 = 1;
const int32_t MAX_LOAD3D_C1_IDX = 4095;
const int32_t MIN_LOAD3D_C1_IDX = 0;
const int32_t MAX_LOAD3D_LEFT_TOP = 32767;
const int32_t MIN_LOAD3D_LEFT_TOP = -255;
const int32_t MAX_LOAD3D_STRIDE = 63;
const int32_t MIN_LOAD3D_STRIDE = 1;
const int32_t MAX_LOAD3D_FILTER = 255;
const int32_t MIN_LOAD3D_FILTER = 1;
const int32_t MIN_LOAD3D_FETCH_FILTER = 0;
const int32_t MIN_LOAD3D_DILATION_FILTER = 1;
const int32_t MAX_LOAD3D_JUMP_STRIDE = 127;
const int32_t MIN_LOAD3D_JUMP_STRIDE = 1;
const int32_t MAX_LOAD3D_REPEAT_MODE = 1;
const int32_t MIN_LOAD3D_REPEAT_MODE = 0;
const int32_t MIN_LOAD3D_REPEAT_TIMES = 1;
const int32_t MAX_LOAD3D_CSIZE = 1;
const int32_t MIN_LOAD3D_CSIZE = 0;
// load3dv2 param size
const int32_t MAX_LOAD3D_CHANNEL_SIZE = 65535;
const int32_t MIN_LOAD3D_CHANNEL_SIZE = 1;
const int32_t MAX_LOAD3D_EXTENSION = 65535;
const int32_t MIN_LOAD3D_EXTENSION = 1;
const int32_t MAX_LOAD3D_START_PT = 65535;
const int32_t MIN_LOAD3D_START_PT = 0;
// loadImageToL1 param size
const int32_t MAX_LOADIMANG_L1_HORSIZE = 4095;
const int32_t MIN_LOADIMANG_L1_HORSIZE = 0;
const int32_t MAX_LOADIMANG_L1_VERSIZE = 4095;
const int32_t MIN_LOADIMANG_L1_VERSIZE = 0;
const int32_t MAX_LOADIMANG_L1_HWSTART = 4095;
const int32_t MIN_LOADIMANG_L1_HWSTART = 0;
const int32_t MAX_LOADIMANG_L1_SHORRES = 65535;
const int32_t MIN_LOADIMANG_L1_SHORRES = 0;
const int32_t MAX_LOADIMANG_L1_TBPADSIZE = 127;
const int32_t MIN_LOADIMANG_L1_TBPADSIZE = 0;
const int32_t MAX_LOADIMANG_L1_LRPADSIZE = 4095;
const int32_t MIN_LOADIMANG_L1_LRPADSIZE = 0;
// mmad param size
const int32_t MAX_M_K_N_SIZE = 4095;
const int32_t MIN_M_K_N_SIZE = 0;
// mrgsort4 param size
const int32_t MAX_SORT_ELE_LEN = 4095;
const int32_t MIN_SORT_ELE_LEN = 0;
const int32_t MIN_SORT_REPEAT_TIMES = 1;
// Host-side helpers for the CPU kernel test build.
// NOTE(review): template parameter lists below were lost in extraction
// (upstream: `template <typename T>` plus explicit specializations).
template std::string ScalarToString(T scalarValue);
template <> inline std::string ScalarToString(half scalarValue)
{
    return std::to_string(scalarValue.ToFloat());
}
#if __CCE_AICORE__ >= 220 && __CCE_AICORE__ != 310
template <> inline std::string ScalarToString(bfloat16_t scalarValue)
{
    return std::to_string(scalarValue.ToFloat());
}
#endif
template uint64_t GetScalarBitcode(T scalarValue);
// deq tensor ptr could not be passed by cce instructions, so pass ptr to model by this function
void SetModelDeqTensor(void* deqTensor);
#if __CCE_AICORE__ == 200
void SetVbiSrc0Param(half* vbiSrc0Ptr, int32_t vbiSrc0Size);
#endif
void SetModelBiasTensor(void* biasTensor);
void SetIndexMatrix(void* indexMatrix);
// src0 of gatherb instr could not be accessed by cce instructions, so pass ptr to model by this function
void SetModelGatherbSrc0Tensor(uint64_t src0, const uint32_t length);
// dst0 of scatter instr could not be accessed by cce instructions, so pass ptr to model by this function
void SetModelScatterDst0Tensor(uint64_t dst0, const uint32_t length);
int32_t TensorWriteFile(const std::string& fileName, const void* buffer, size_t size);
#endif // __CCE_KT_TEST__

// Compile-time type selection.
// NOTE(review): parameter lists lost in extraction (upstream:
// `template <bool b, typename T1, typename T2> struct Conditional` + partial specialization).
template struct Conditional {
    using type = T1;
};
template struct Conditional {
    using type = T2;
};

// Emulation of a sub-byte integer (e.g. int4) stored in a uint8_t.
// Bit width / signedness come from the (extraction-lost) template parameters bitNum / sign.
template struct IntegerSubType {
    static int const kBits = bitNum;
    static bool const kSigned = sign;
    using T = typename Conditional::type;
    using Storage = uint8_t;
    // Low-kBits mask selecting the value bits inside the storage byte.
    static Storage const mask = Storage(((static_cast(1)) << static_cast(kBits)) - 1);
    Storage storage;
    __aicore__ inline IntegerSubType() = default;
    __aicore__ inline IntegerSubType(uint32_t value) : storage(reinterpret_cast(value) & mask) {}
    __aicore__ inline IntegerSubType(int32_t value) : storage(reinterpret_cast(value) & mask) {}
    // Widening conversion; sign-extends when the type is signed and the top bit is set.
    __aicore__ inline operator T() const
    {
        if (kSigned && (storage & Storage(static_cast(1) << static_cast(kBits - 1)))) {
            // Sign extend
            return T(storage) | ~T(mask);
        }
        return T(storage);
    }
    __aicore__ inline bool operator == (IntegerSubType const &rhs) const
    {
        return storage == rhs.storage;
    }
    __aicore__ inline bool operator != (IntegerSubType const &rhs) const
    {
        return storage != rhs.storage;
    }
    // Signed-aware comparison: a non-negative lhs beats a negative rhs; otherwise
    // raw storage comparison is correct for same-sign operands.
    __aicore__ inline bool operator > (IntegerSubType const &rhs) const
    {
        bool lhsIsNeg = (this->storage & (static_cast(1) << static_cast(this->kBits - 1)));
        bool rhsIsNeg = (rhs.storage & (static_cast(1) << static_cast(rhs.kBits - 1)));
        if (kSigned && (lhsIsNeg != rhsIsNeg)) {
            return (!lhsIsNeg) && rhsIsNeg;
        }
        return this->storage > rhs.storage;
    }
    __aicore__ inline bool operator >= (IntegerSubType const &rhs) const
    {
        bool lhsIsNeg = (this->storage & (static_cast(1) << static_cast(this->kBits - 1)));
        bool rhsIsNeg = (rhs.storage & (static_cast(1) << static_cast(rhs.kBits - 1)));
        if (kSigned && (lhsIsNeg != rhsIsNeg)) {
            return (!lhsIsNeg) && rhsIsNeg;
        }
        return storage >= rhs.storage;
    }
    __aicore__ inline bool operator < (IntegerSubType const &rhs) const
    {
        return
!(*this >= rhs);
    }
    __aicore__ inline bool operator <= (IntegerSubType const &rhs) const
    {
        return !(*this > rhs);
    }
};

// 4-bit signed integer (template arguments lost in extraction; upstream: IntegerSubType<4, true>).
using int4b_t = IntegerSubType;
template struct SizeOfBits {};
template <> struct SizeOfBits {
    static int const value = INT4_BIT_NUM;
};

#if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1
// CPU-side singleton that emulates the on-chip memories with heap buffers for kernel tests.
class ConstDefiner {
public:
    static ConstDefiner& Instance()
    {
        static ConstDefiner instance;
        return instance;
    };
    // Returns the CPU buffer standing in for the given hardware memory; asserts on unknown values.
    uint8_t* GetHardwareBaseAddr(Hardware hardPos)
    {
        ASCENDC_ASSERT((hardwareCpuBufferMap.find(hardPos) != hardwareCpuBufferMap.end()),
            { KERNEL_LOG(KERNEL_ERROR, "illegal hardPos %d", static_cast(hardPos)); });
        return hardwareCpuBufferMap[hardPos];
    }
    // Logical queue position -> physical memory mapping (varies per architecture).
    // NOTE(review): std::map template arguments were lost in extraction
    // (upstream: std::map<TPosition, Hardware> here, std::map<Hardware, uint32_t> below).
#if __CCE_AICORE__ <= 200
    const std::map positionHardMap = {
        { TPosition::GM, Hardware::GM },
        { TPosition::A1, Hardware::L1 },
        { TPosition::B1, Hardware::L1 },
        { TPosition::TSCM, Hardware::L1 },
        { TPosition::VECIN, Hardware::UB },
        { TPosition::VECOUT, Hardware::UB },
        { TPosition::VECCALC, Hardware::UB },
        { TPosition::A2, Hardware::L0A },
        { TPosition::B2, Hardware::L0B },
        { TPosition::C1, Hardware::L1 },
        { TPosition::C2, Hardware::BIAS },
        { TPosition::CO1, Hardware::L0C },
        { TPosition::CO2, Hardware::UB },
    };
#elif __CCE_AICORE__ == 220
    const std::map positionHardMap = {
        { TPosition::GM, Hardware::GM },
        { TPosition::A1, Hardware::L1 },
        { TPosition::B1, Hardware::L1 },
        { TPosition::TSCM, Hardware::L1 },
        { TPosition::VECIN, Hardware::UB },
        { TPosition::VECOUT, Hardware::UB },
        { TPosition::VECCALC, Hardware::UB },
        { TPosition::A2, Hardware::L0A },
        { TPosition::B2, Hardware::L0B },
        { TPosition::C1, Hardware::L1 },
        { TPosition::C2, Hardware::BIAS },
        { TPosition::CO1, Hardware::L0C },
        { TPosition::CO2, Hardware::GM },
        { TPosition::C2PIPE2GM, Hardware::FIXBUF },
    };
#elif __CCE_AICORE__ == 300
    const std::map positionHardMap = {
        { TPosition::GM, Hardware::GM },
        { TPosition::A1, Hardware::L1 },
        { TPosition::B1, Hardware::L1 },
        { TPosition::TSCM, Hardware::L1 },
        { TPosition::VECIN, Hardware::UB },
        { TPosition::VECOUT, Hardware::UB },
        { TPosition::VECCALC, Hardware::UB },
        { TPosition::A2, Hardware::L0A },
        { TPosition::B2, Hardware::L0B },
        { TPosition::C1, Hardware::L1 },
        { TPosition::C2, Hardware::BIAS },
        { TPosition::CO1, Hardware::L0C },
        { TPosition::CO2, Hardware::GM },
        { TPosition::C2PIPE2GM, Hardware::FIXBUF },
    };
#elif __CCE_AICORE__ == 310
    const std::map positionHardMap = {
        { TPosition::GM, Hardware::GM },
        { TPosition::A1, Hardware::L1 },
        { TPosition::B1, Hardware::L1 },
        { TPosition::TSCM, Hardware::L1 },
        { TPosition::VECIN, Hardware::UB },
        { TPosition::VECOUT, Hardware::UB },
        { TPosition::VECCALC, Hardware::UB },
        { TPosition::A2, Hardware::L0A },
        { TPosition::B2, Hardware::L0B },
        { TPosition::C1, Hardware::L1 },
        { TPosition::C2, Hardware::BIAS },
        { TPosition::CO1, Hardware::L0C },
        { TPosition::CO2, Hardware::GM },
    };
#endif
    // Byte size of the CPU buffer allocated per emulated memory.
#if __CCE_AICORE__ <= 200
    const std::map bufferInitLen = {
        { Hardware::GM, 1024 * 1024 },
        { Hardware::UB, 1024 * 256 },
        { Hardware::L1, 1024 * 1024 },
        { Hardware::L0A, 1024 * 64 },
        { Hardware::L0B, 1024 * 64 },
        { Hardware::L0C, 1024 * 256 },
        { Hardware::BIAS, 1024 * 64 },
        { Hardware::FIXBUF, 1024 * 64 },
    };
#elif (__CCE_AICORE__ == 220)
    const std::map bufferInitLen = {
        { Hardware::GM, 1024 * 1024 },
        { Hardware::UB, 1024 * 192 },
        { Hardware::L1, 1024 * 512 },
        { Hardware::L0A, 1024 * 64 },
        { Hardware::L0B, 1024 * 64 },
        { Hardware::L0C, 1024 * 128 },
        { Hardware::BIAS, 1024 * 1 },
        { Hardware::FIXBUF, 1024 * 7 },
    };
#elif (__CCE_AICORE__ == 300)
    const std::map bufferInitLen = {
        { Hardware::GM, 1024 * 1024 },
        { Hardware::UB, 1024 * 256 },
        { Hardware::L1, 1024 * 1024 },
        { Hardware::L0A, 1024 * 64 },
        { Hardware::L0B, 1024 * 64 },
        { Hardware::L0C, 1024 * 128 },
        { Hardware::BIAS, 1024 * 1 },
        { Hardware::FIXBUF, 1024 * 7 },
    };
#elif (__CCE_AICORE__ == 310)
    const std::map bufferInitLen = {
        { Hardware::GM, 1024 * 1024 },
        { Hardware::UB, 1024 * 256 },
        { Hardware::L1, 1024 * 1024 },
        { Hardware::L0A, 1024 * 64 },
        { Hardware::L0B, 1024 * 64 },
        { Hardware::L0C, 1024 * 128 },
        { Hardware::BIAS, 1024 * 1 },
        { Hardware::FIXBUF, 1024 * 7 },
    };
#endif
    uint8_t* cpuGM;
    uint8_t* cpuUB;
    uint8_t* cpuL1;
    uint8_t* cpuL0A;
    uint8_t* cpuL0B;
    uint8_t* cpuL0C;
    uint8_t* cpuBIAS;
    uint8_t* cpuFIXBUF;
    std::map hardwareCpuBufferMap;
private:
    // Allocates one heap buffer per emulated memory; GM is intentionally
    // absent from hardwareCpuBufferMap.
    ConstDefiner()
    {
        cpuGM = new uint8_t[bufferInitLen.at(Hardware::GM)];
        cpuUB = new uint8_t[bufferInitLen.at(Hardware::UB)];
        cpuL1 = new uint8_t[bufferInitLen.at(Hardware::L1)];
        cpuL0A = new uint8_t[bufferInitLen.at(Hardware::L0A)];
        cpuL0B = new uint8_t[bufferInitLen.at(Hardware::L0B)];
        cpuL0C = new uint8_t[bufferInitLen.at(Hardware::L0C)];
        cpuBIAS = new uint8_t[bufferInitLen.at(Hardware::BIAS)];
        cpuFIXBUF = new uint8_t[bufferInitLen.at(Hardware::FIXBUF)];
        hardwareCpuBufferMap = {
            { Hardware::UB, cpuUB },
            { Hardware::L1, cpuL1 },
            { Hardware::L0A, cpuL0A },
            { Hardware::L0B, cpuL0B },
            { Hardware::L0C, cpuL0C },
            { Hardware::BIAS, cpuBIAS },
            { Hardware::FIXBUF, cpuFIXBUF },
        };
    }
    ~ConstDefiner()
    {
        if (cpuGM != nullptr) {
            delete[] cpuGM;
            cpuGM = nullptr;
        }
        if (cpuUB != nullptr) {
            delete[] cpuUB;
            cpuUB = nullptr;
        }
        if (cpuL1 != nullptr) {
            delete[] cpuL1;
            cpuL1 = nullptr;
        }
        if (cpuL0A != nullptr) {
            delete[] cpuL0A;
            cpuL0A = nullptr;
        }
        if (cpuL0B != nullptr) {
            delete[] cpuL0B;
            cpuL0B = nullptr;
        }
        if (cpuL0C != nullptr) {
            delete[] cpuL0C;
            cpuL0C = nullptr;
        }
        if (cpuBIAS != nullptr) {
            delete[] cpuBIAS;
            cpuBIAS = nullptr;
        }
        if (cpuFIXBUF != nullptr) {
            delete[] cpuFIXBUF;
            cpuFIXBUF = nullptr;
        }
    }
};
#endif
/*
 * Format-transpose mapping table:
 * input_format -> output_format; new_input_format -> new_output_format; new_input_shape -> new_output_shape;
 * NHWC -> NC1HWC0 NHC -> NCHT [0,1*2,3] -> [0,1,2*3,4]
 * ND -> FRACTAL_NZ HNC -> HCNT [0:-2,-2,-1] -> [0:-4,-4,-3*-2,-1]
 * NDHWC -> FRACTAL_Z_3D NDHC -> DCHNT [0,1,2*3,4] -> [0,1,2*3,4*5,6]
 * NC1HWC0 -> FRACTAL_Z NDHC -> DCHNT [0,(1),1*2*3,4]-> [(1),(1),0*1*2,3*4,5]
 * NCDHW -> NDC1HWC0 NCDH -> NDCHT [0,1,2,3*4] -> [0,1,2,3*4,5]
 * NCHW -> NC1HWC0 NCH -> NCHT [0,1,2*3] -> [0,1,2*3,4]
 * HWCN -> FRACTAL_Z HCN -> CHNT [0*1,2,3] ->
[0,1*2,3*4,5]
 * DHWCN -> FRACTAL_Z_3D DHCN -> DCHNT [0,1*2,3,4] -> [0,1,2*3,4*5,6]
 * ND -> FRACTAL_Z HCN -> HCNT [0:-2,-2,-1] -> [0:-4,-4,-3*-2,-1]
 * NCHW -> FRACTAL_Z NCH -> CHNT [0,1,2*3] -> [0,1*2,3*4,5]
 * NCDHW -> FRACTAL_Z_3D NCDH -> DCHNT [0,1,2,3*4] -> [0,1,2*3,4*5,6]
 * NC1HWC0 -> NHWC NCHT -> NHC [0,1,2*3,4] -> [0,1*2,3]
 * NDC1HWC0 -> NDHWC NCHT -> NHC [0*1,2,3*4,5] -> [0*1,2*3,4]
 * FRACTAL_Z_3D -> NDHWC DCHNT -> NDHC [0,1,2*3,4*5,6] -> [0,1,2*3,4]
 * FRACTAL_NZ -> NC1HWC0 DCHNT -> NDHC [(1),(1),0*1*2,3*4,5]-> [0,(1),1*2*3,4]
 * NDC1HWC0 -> NCDHW NCHT -> NCDH [0,1,2,3*4,5] -> [0,1,2,3*4]
 * NC1HWC0 -> NCHW NCHT -> NCH [0,1,2*3,4] -> [0,1,2*3]
 * FRACTAL_Z -> HWCN CHNT -> HCN [0,1*2,3*4,5] -> [0*1,2,3]
 * FRACTAL_Z_3D -> DHWCN DCHNT -> DHCN [0,1,2*3,4*5,6] -> [0,1*2,3,4]
 * FRACTAL_Z -> NCHW CHNT -> NCH [0,1*2,3*4,5] -> [0,1,2*3]
 * FRACTAL_Z_3D -> NCDHW DCHNT -> NCDH [0,1,2*3,4*5,6] -> [0,1,2,3*4]
 * FRACTAL_Z -> ND HCNT -> HCN [0:-4,-4,-3*-2,-1] -> [0:-2,-2,-1]
 */
// Process-wide switch controlling whether vector ops (re)set the mask register.
class MaskSetter {
public:
    static MaskSetter& Instance()
    {
        static MaskSetter instance;
        return instance;
    };
    void SetMask(bool setMask)
    {
        isSetMask = setMask;
    }
    bool GetMask() const
    {
        return isSetMask;
    }
private:
    MaskSetter(){};
    ~MaskSetter(){};
    bool isSetMask = true;
};

// Tracks whether the current operation's operands (dst/src) are int4-typed.
class Int4Setter {
public:
    static Int4Setter& Instance()
    {
        static Int4Setter instance;
        return instance;
    };
    void SetInt4()
    {
        isInt4 = true;
    }
    void SetDstInt4()
    {
        isDstInt4 = true;
    }
    void SetSrcInt4()
    {
        isSrcInt4 = true;
    }
    void ResetInt4()
    {
        isInt4 = false;
    }
    void ResetDstSrcInt4()
    {
        isDstInt4 = false;
        isSrcInt4 = false;
    }
    bool GetInt4() const
    {
        return isInt4;
    }
    bool GetDstInt4() const
    {
        return isDstInt4;
    }
    bool GetSrcInt4() const
    {
        return isSrcInt4;
    }
private:
    Int4Setter(){};
    ~Int4Setter(){};
    bool isInt4 = false;
    bool isDstInt4 = false;
    bool isSrcInt4 = false;
};

// Reinterprets a float's bits as uint32 (Inf/NaN bit-pattern handling).
union NotNumUnion {
    __aicore__ NotNumUnion() {}
    float f;
    uint32_t i;
};

enum class TShapeType : uint8_t {
    DEFAULT,
    NHWC,
    NC1HWC0,
    NHC,
    NCHT,
    ND,
    FRACTAL_NZ,
    HNC,
    HCNT,
    NDHWC,
    FRACTAL_Z_3D,
    NDHC,
    DCHNT,
    FRACTAL_Z,
    NCDHW,
    NDC1HWC0,
    NCDH,
    NDCHT,
    NCHW,
    NCH,
    HWCN,
    HCN,
    CHNT,
    DHWCN,
    DHCN
};

enum class GatherMaskMode : uint8_t {
    VERSION_V1 = 0,
    VERSION_V2 = 1
};
#if (__CCE_AICORE__ == 220)
const GatherMaskMode defaultGahterMaskMode = GatherMaskMode::VERSION_V2;
#else
const GatherMaskMode defaultGahterMaskMode = GatherMaskMode::VERSION_V1;
#endif

enum class CMPMODE : uint8_t {
    LT = 0,
    GT,
    EQ,
    LE,
    GE,
    NE,
};

enum class RoundMode : uint8_t {
    CAST_NONE = 0,
    CAST_RINT,  // round
    CAST_FLOOR,
    CAST_CEIL,
    CAST_ROUND, // away-zero
    CAST_TRUNC, // to-zero
    CAST_ODD,   // Von Neumann rounding
};

enum class SELMODE : uint8_t {
    VSEL_CMPMASK_SPR = 0,
    VSEL_TENSOR_SCALAR_MODE,
    VSEL_TENSOR_TENSOR_MODE,
};

enum class BlockMode : uint8_t {
    BLOCK_MODE_NORMAL = 0,
    BLOCK_MODE_MATRIX,
    BLOCK_MODE_VECTOR,
    BLOCK_MODE_SMALL_CHANNEL,
    BLOCK_MODE_DEPTHWISE,
};

enum class DeqScale : uint8_t {
    DEQ_NONE = 0,
    DEQ,
    VDEQ,
    DEQ8,
    VDEQ8,
    DEQ16,
    VDEQ16,
};

enum class ReduceMode : uint8_t {
    REDUCE_MAX = 0,
    REDUCE_MIN,
    REDUCE_SUM,
};

enum class ReduceOrder : uint8_t {
    ORDER_VALUE_INDEX = 0,
    ORDER_INDEX_VALUE,
    ORDER_ONLY_VALUE,
    ORDER_ONLY_INDEX,
};

enum class DumpType : uint8_t {
    DUMP_DEFAULT = 0,
    DUMP_SCALAR,
    DUMP_TENSOR,
    DUMP_SHAPE,
    DUMP_ASSERT,
    DUMP_META,
};

enum class CLAMPMODE {
    CLAMP_MAX = 0,
    CLAMP_MIN,
};

enum class FmatrixMode : uint8_t {
    FMATRIX_LEFT = 0,
    FMATRIX_RIGHT = 1,
};

enum class PcieCtrl : uint64_t {
    WR = 0,
    RD
};

enum class DeQuantMode : uint8_t {
    DEQUANT_WITH_SINGLE_ROW = 0, // {1, m * n, n} = {m, n, n}
    DEQUANT_WITH_MULTI_ROW,      // {1, m * n, n} != {m, n, n}
};

// Describes a local-memory region and the access kinds to check for illegal access (IA).
struct CheckLocalMemoryIAParam {
    __aicore__ CheckLocalMemoryIAParam()
    {
        enableBit = 0;
        startAddr = 0;
        endAddr = 0;
        isScalarRead = false;
        isScalarWrite = false;
        isVectorRead = false;
        isVectorWrite = false;
        isMteRead = false;
        isMteWrite = false;
        isEnable = false;
    }
    __aicore__ CheckLocalMemoryIAParam(const uint8_t enableBitIn, const uint32_t startAddrIn,
        const uint32_t endAddrIn, const bool isScalarReadIn, const bool isScalarWriteIn, const bool
isVectorReadIn, const bool isVectorWriteIn, const bool isMteReadIn, const bool isMteWriteIn, const bool isEnableIn) { enableBit = enableBitIn; startAddr = startAddrIn; endAddr = endAddrIn; isScalarRead = isScalarReadIn; isScalarWrite = isScalarWriteIn; isVectorRead = isVectorReadIn; isVectorWrite = isVectorWriteIn; isMteRead = isMteReadIn; isMteWrite = isMteWriteIn; isEnable = isEnableIn; } uint8_t enableBit = 0; uint32_t startAddr = 0; uint32_t endAddr = 0; bool isScalarRead = false; bool isScalarWrite = false; bool isVectorRead = false; bool isVectorWrite = false; bool isMteRead = false; bool isMteWrite = false; bool isEnable = false; uint32_t reserved = 0; }; struct ReduceRepeatParams { __aicore__ ReduceRepeatParams() { highMask = FULL_MASK; lowMask = FULL_MASK; repeatTimes = 0; dstRepStride = DEFAULT_REDUCE_DST_REP_SRIDE; // dst Stride Unit is 2B(fp16)/4B(fp32) srcBlkStride = DEFAULT_BLK_STRIDE; srcRepStride = DEFAULT_REPEAT_STRIDE; // src Stride Unit is 32B } __aicore__ ReduceRepeatParams(const int32_t mask, const int32_t repeatTimesIn, const int32_t dstRepStrideIn, const int32_t srcBlkStrideIn, const int32_t srcRepStrideIn) { constexpr int32_t doubleFactor = 2; #if __CCE_AICORE__ == 300 || __CCE_AICORE__ == 310 normalMask = mask; maskMode = 1; #else if (mask == HLAF_MASK_LEN) { highMask = 0; lowMask = FULL_MASK; } else if (mask == HLAF_MASK_LEN * doubleFactor) { highMask = FULL_MASK; lowMask = FULL_MASK; } else { highMask = (mask > HLAF_MASK_LEN) ? (((static_cast(1)) << static_cast(mask - HLAF_MASK_LEN)) - 1) : 0; lowMask = (mask > HLAF_MASK_LEN) ? 
FULL_MASK : (((static_cast(1)) << static_cast(mask)) - 1); } #endif repeatTimes = repeatTimesIn; dstRepStride = dstRepStrideIn; srcBlkStride = srcBlkStrideIn; srcRepStride = srcRepStrideIn; } __aicore__ ReduceRepeatParams(const uint64_t mask[2], const int32_t repeatTimesIn, const int32_t dstRepStrideIn, const int32_t srcBlkStrideIn, const int32_t srcRepStrideIn) { #if __CCE_AICORE__ == 300 || __CCE_AICORE__ == 310 bitMask[0] = mask[0]; bitMask[1] = mask[1]; #else highMask = mask[1]; lowMask = mask[0]; #endif repeatTimes = repeatTimesIn; dstRepStride = dstRepStrideIn; srcBlkStride = srcBlkStrideIn; srcRepStride = srcRepStrideIn; } uint64_t highMask = 0; uint64_t lowMask = 0; uint64_t bitMask[2] = {0, 0}; int32_t normalMask = 0; int32_t maskMode = 0; int32_t repeatTimes = 0; int32_t dstRepStride = 0; int32_t srcBlkStride = 0; int32_t srcRepStride = 0; }; struct DumpMessageHead { __aicore__ DumpMessageHead() { type = 0; lenth = 0; addr = 0; dataType = 0; desc = 0; bufferId = 0; position = 0; rsv = 0; } __aicore__ DumpMessageHead(uint32_t typeIn, uint32_t lenthIn, uint32_t addrIn, uint32_t dataTypeIn, uint32_t descIn, uint32_t bufferIdIn, uint32_t positionIn, uint32_t rsvIn) { type = typeIn; lenth = lenthIn; addr = addrIn; dataType = dataTypeIn; desc = descIn; bufferId = bufferIdIn; position = positionIn; rsv = rsvIn; } uint32_t type = 0; // Dump Type 1:DumpScalar(DUMP_SCALAR), 2:DumpTensor (DUMP_TENSOR) uint32_t lenth = 0; uint32_t addr = 0; // Dumptensor address, DumpScalar:0 uint32_t dataType = 0; // data type: int32_t/half/... 
uint32_t desc = 0; // for usr to add info or tag uint32_t bufferId = 0; // DumpScalar: Blockid, DumpTensor: UB adddr () uint32_t position = 0; // DumpScalar: 0: MIX, 1: AIC 2: AIV; DumpTensor: 1:UB, 2:L1 uint32_t rsv = 0; // reserved information }; struct DumpShapeMessageHead { __aicore__ DumpShapeMessageHead() { dim = 0; rsv = 0; for (int idx = 0; idx < K_MAX_SHAPE_DIM; ++idx) { shape[idx] = 0; } } __aicore__ DumpShapeMessageHead(uint32_t dimIn, uint32_t shapeIn[], uint32_t rsvIn = 0) { ASCENDC_ASSERT((dimIn <= K_MAX_SHAPE_DIM), { KERNEL_LOG(KERNEL_ERROR, "dim is %u, which should be less than %d", dimIn, K_MAX_SHAPE_DIM); }); dim = dimIn; rsv = rsvIn; for (int idx = 0; idx < K_MAX_SHAPE_DIM; ++idx) { if (idx < dim) { shape[idx] = shapeIn[idx]; } else { shape[idx] = 0; } } } uint32_t dim = 0; uint32_t shape[K_MAX_SHAPE_DIM]; uint32_t rsv = 0; // reserved information }; struct UnaryRepeatParams { __aicore__ UnaryRepeatParams() { blockNumber = DEFAULT_BLK_NUM; dstBlkStride = DEFAULT_BLK_STRIDE; srcBlkStride = DEFAULT_BLK_STRIDE; dstRepStride = DEFAULT_REPEAT_STRIDE; srcRepStride = DEFAULT_REPEAT_STRIDE; halfBlock = false; } __aicore__ UnaryRepeatParams(const uint16_t dstBlkStrideIn, const uint16_t srcBlkStrideIn, const uint8_t dstRepStrideIn, const uint8_t srcRepStrideIn) { dstBlkStride = dstBlkStrideIn; srcBlkStride = srcBlkStrideIn; dstRepStride = dstRepStrideIn; srcRepStride = srcRepStrideIn; } __aicore__ UnaryRepeatParams(const uint16_t dstBlkStrideIn, const uint16_t srcBlkStrideIn, const uint8_t dstRepStrideIn, const uint8_t srcRepStrideIn, const bool halfBlockIn) { dstBlkStride = dstBlkStrideIn; srcBlkStride = srcBlkStrideIn; dstRepStride = dstRepStrideIn; srcRepStride = srcRepStrideIn; halfBlock = halfBlockIn; } uint32_t blockNumber = 0; uint16_t dstBlkStride = 0; uint16_t srcBlkStride = 0; uint8_t dstRepStride = 0; uint8_t srcRepStride = 0; bool repeatStrideMode = false; bool strideSizeMode = false; bool halfBlock = false; }; struct ProposalIntriParams { 
__aicore__ ProposalIntriParams() { repeat = 0; modeNumber = 0; } __aicore__ ProposalIntriParams(const int32_t repeatTimes, const int32_t modeNumberIn) { repeat = repeatTimes; // [1,255] modeNumber = modeNumberIn; // modeNumberIn: 0: x1, 1: y1, 2: x2, 3: y2, 4: score, 5:label } int32_t repeat = 0; int32_t modeNumber = 0; }; struct BlockInfo { __aicore__ BlockInfo() { len = 0; core = 0; blockNum = 0; dumpOffset = 0; magic = 0; rsv = 0; dumpAddr = 0; } __aicore__ BlockInfo(uint64_t dumpAddrIn, uint32_t lenIn, uint32_t coreIn, uint32_t blockNumIn, uint32_t dumpOffsetIn, uint32_t magicIn, uint32_t rsvIn) { len = lenIn; core = coreIn; blockNum = blockNumIn; dumpOffset = dumpOffsetIn; magic = magicIn; rsv = rsvIn; dumpAddr = dumpAddrIn; } uint32_t len = 0; uint32_t core = 0; // current core id uint32_t blockNum = 0; // total core num uint32_t dumpOffset = 0; // size used by current core uint32_t magic = 0; // magic number uint32_t rsv = 0; uint64_t dumpAddr = 0; // start addr of dump }; struct DumpMeta { uint32_t typeId = static_cast(DumpType::DUMP_META); uint32_t len = 8; uint16_t blockDim = 0; uint8_t coreType = 0; uint8_t taskRation = 0; uint32_t rsv = 0; }; struct BinaryRepeatParams { __aicore__ BinaryRepeatParams() { blockNumber = DEFAULT_BLK_NUM; dstBlkStride = DEFAULT_BLK_STRIDE; src0BlkStride = DEFAULT_BLK_STRIDE; src1BlkStride = DEFAULT_BLK_STRIDE; dstRepStride = DEFAULT_REPEAT_STRIDE; src0RepStride = DEFAULT_REPEAT_STRIDE; src1RepStride = DEFAULT_REPEAT_STRIDE; } __aicore__ BinaryRepeatParams(const uint8_t dstBlkStrideIn, const uint8_t src0BlkStrideIn, const uint8_t src1BlkStrideIn, const uint8_t dstRepStrideIn, const uint8_t src0RepStrideIn, const uint8_t src1RepStrideIn) { dstBlkStride = dstBlkStrideIn; src0BlkStride = src0BlkStrideIn; src1BlkStride = src1BlkStrideIn; dstRepStride = dstRepStrideIn; src0RepStride = src0RepStrideIn; src1RepStride = src1RepStrideIn; } uint32_t blockNumber = 0; uint8_t dstBlkStride = 0; uint8_t src0BlkStride = 0; uint8_t 
src1BlkStride = 0; uint8_t dstRepStride = 0; uint8_t src0RepStride = 0; uint8_t src1RepStride = 0; bool repeatStrideMode = false; bool strideSizeMode = false; }; struct VdeqInfo { __aicore__ VdeqInfo() {} __aicore__ VdeqInfo(const float vdeqScaleIn[VDEQ_TENSOR_SIZE], const int16_t vdeqOffsetIn[VDEQ_TENSOR_SIZE], const bool vdeqSignModeIn[VDEQ_TENSOR_SIZE]) { for (int32_t i = 0; i < VDEQ_TENSOR_SIZE; ++i) { vdeqScale[i] = vdeqScaleIn[i]; vdeqOffset[i] = vdeqOffsetIn[i]; vdeqSignMode[i] = vdeqSignModeIn[i]; } } float vdeqScale[VDEQ_TENSOR_SIZE] = { 0 }; int16_t vdeqOffset[VDEQ_TENSOR_SIZE] = { 0 }; bool vdeqSignMode[VDEQ_TENSOR_SIZE] = { 0 }; }; struct GatherRepeatParams { __aicore__ GatherRepeatParams() { blockNumber = DEFAULT_BLK_NUM; dstBlkStride = DEFAULT_BLK_STRIDE; src0BlkStride = DEFAULT_BLK_STRIDE; src1BlkStride = DEFAULT_BLK_STRIDE; dstRepStride = DEFAULT_REPEAT_STRIDE; src0RepStride = DEFAULT_REPEAT_STRIDE; src1RepStride = DEFAULT_REPEAT_STRIDE; } __aicore__ GatherRepeatParams(const uint8_t dstBlkStrideIn, const uint8_t dstRepStrideIn) { dstBlkStride = dstBlkStrideIn; dstRepStride = dstRepStrideIn; } uint32_t blockNumber = DEFAULT_BLK_NUM; uint16_t dstRepStride = DEFAULT_REPEAT_STRIDE; uint8_t dstBlkStride = DEFAULT_BLK_STRIDE; uint8_t src0BlkStride = DEFAULT_BLK_STRIDE; uint8_t src1BlkStride = DEFAULT_BLK_STRIDE; uint8_t src0RepStride = DEFAULT_REPEAT_STRIDE; uint8_t src1RepStride = DEFAULT_REPEAT_STRIDE; bool repeatStrideMode = false; bool strideSizeMode = false; }; struct BrcbRepeatParams { __aicore__ BrcbRepeatParams() { blockNumber = DEFAULT_BLK_NUM; dstBlkStride = DEFAULT_BLK_STRIDE; src0BlkStride = DEFAULT_BLK_STRIDE; src1BlkStride = DEFAULT_BLK_STRIDE; dstRepStride = DEFAULT_REPEAT_STRIDE; src0RepStride = DEFAULT_REPEAT_STRIDE; src1RepStride = DEFAULT_REPEAT_STRIDE; } __aicore__ BrcbRepeatParams(const uint16_t dstBlkStrideIn, const uint16_t dstRepStrideIn) { dstBlkStride = dstBlkStrideIn; dstRepStride = dstRepStrideIn; } uint32_t blockNumber = 
DEFAULT_BLK_NUM; uint16_t dstRepStride = DEFAULT_REPEAT_STRIDE; uint16_t dstBlkStride = DEFAULT_BLK_STRIDE; uint8_t src0BlkStride = DEFAULT_BLK_STRIDE; uint8_t src1BlkStride = DEFAULT_BLK_STRIDE; uint8_t src0RepStride = DEFAULT_REPEAT_STRIDE; uint8_t src1RepStride = DEFAULT_REPEAT_STRIDE; bool repeatStrideMode = false; bool strideSizeMode = false; }; // MM intr params using LoadData2dParams = struct LoadData2DParams; struct LoadData2DParams { __aicore__ LoadData2DParams() { startIndex = 0; repeatTimes = 0; srcStride = 0; sid = 0; dstGap = 0; ifTranspose = false; addrMode = 0; } __aicore__ LoadData2DParams(const uint16_t startIndexIn, const uint8_t repeatTimesIn, const uint16_t srcStrideIn, const uint8_t sidIn, const uint16_t dstGapIn, const bool ifTransposeIn, const uint8_t addrModeIn) { startIndex = startIndexIn; repeatTimes = repeatTimesIn; srcStride = srcStrideIn; sid = sidIn; dstGap = dstGapIn; ifTranspose = ifTransposeIn; addrMode = addrModeIn; } uint16_t startIndex = 0; uint16_t dstGap = 0; uint16_t srcStride = 0; bool ifTranspose = 0; uint8_t repeatTimes = 0; uint8_t sid = 0; uint8_t addrMode = 0; }; struct LoadData2dTransposeParams { __aicore__ LoadData2dTransposeParams() { startIndex = 0; repeatTimes = 0; srcStride = 0; dstGap = 0; dstFracGap = 0; addrMode = 0; } __aicore__ LoadData2dTransposeParams(const uint16_t startIndexIn, const uint8_t repeatTimesIn, const uint16_t srcStrideIn, const uint16_t dstGapIn, const uint16_t dstfracGapIn, const uint8_t addrModeIn) { startIndex = startIndexIn; repeatTimes = repeatTimesIn; srcStride = srcStrideIn; dstGap = dstGapIn; dstFracGap = dstfracGapIn; addrMode = addrModeIn; } __aicore__ LoadData2dTransposeParams(const uint16_t startIndexIn, const uint8_t repeatTimesIn, const uint16_t srcStrideIn, const uint16_t dstGapIn, const uint16_t dstfracGapIn) { startIndex = startIndexIn; repeatTimes = repeatTimesIn; srcStride = srcStrideIn; dstGap = dstGapIn; dstFracGap = dstfracGapIn; } uint16_t startIndex = 0; uint8_t 
repeatTimes = 0; uint16_t srcStride = 0; uint16_t dstGap = 0; uint16_t dstFracGap = 0; uint8_t addrMode = 0; }; template struct LoadData3DParamsV1 { __aicore__ LoadData3DParamsV1() { for (int32_t i = 0; i < PAD_SIZE; ++i) { padList[i] = 0; } l1H = 0; l1W = 0; c1Index = 0; fetchFilterW = 0; fetchFilterH = 0; leftTopW = 0; leftTopH = 0; strideW = 0; strideH = 0; filterW = 0; filterH = 0; dilationFilterW = 0; dilationFilterH = 0; jumpStride = 0; repeatMode = 0; repeatTime = 0; cSize = 0; padValue = 0; } __aicore__ LoadData3DParamsV1(const uint8_t padListIn[PAD_SIZE], const uint16_t l1HIn, const uint16_t l1WIn, const uint16_t c1IndexIn, const uint8_t fetchFilterWIn, const uint8_t fetchFilterHIn, const int16_t leftTopWIn, const int16_t leftTopHIn, const uint8_t strideWIn, const uint8_t strideHIn, const uint8_t filterWIn, const uint8_t filterHIn, const uint8_t dilationFilterWIn, const uint8_t dilationFilterHIn, const uint8_t jumpStrideIn, const uint8_t repeatModeIn, const uint8_t repeatTimeIn, const uint8_t cSizeIn, const T padValueIn) { for (int32_t i = 0; i < PAD_SIZE; ++i) { padList[i] = padListIn[i]; } l1H = l1HIn; l1W = l1WIn; c1Index = c1IndexIn; fetchFilterW = fetchFilterWIn; fetchFilterH = fetchFilterHIn; leftTopW = leftTopWIn; leftTopH = leftTopHIn; strideW = strideWIn; strideH = strideHIn; filterW = filterWIn; filterH = filterHIn; dilationFilterW = dilationFilterWIn; dilationFilterH = dilationFilterHIn; jumpStride = jumpStrideIn; repeatMode = repeatModeIn; repeatTime = repeatTimeIn; cSize = cSizeIn; padValue = padValueIn; } uint8_t padList[PAD_SIZE] = {0}; uint8_t strideW = 0; uint8_t strideH = 0; uint8_t filterW = 0; uint8_t filterH = 0; uint8_t dilationFilterW = 0; uint8_t dilationFilterH = 0; uint8_t jumpStride = 0; uint8_t repeatMode = 0; uint8_t repeatTime = 0; uint8_t cSize = 0; T padValue = 0; uint8_t fetchFilterW = 0; uint8_t fetchFilterH = 0; uint16_t l1H = 0; uint16_t l1W = 0; uint16_t c1Index = 0; int16_t leftTopW = 0; int16_t leftTopH = 0; }; 
template struct LoadData3DParamsV2 { __aicore__ LoadData3DParamsV2() { for (int32_t i = 0; i < PAD_SIZE; ++i) { padList[i] = 0; } l1H = 0; l1W = 0; channelSize = 0; kExtension = 0; mExtension = 0; kStartPt = 0; mStartPt = 0; strideW = 1; strideH = 1; filterW = 1; filterH = 1; dilationFilterW = 1; dilationFilterH = 1; enTranspose = false; enSmallK = false; padValue = 0; filterSizeW = false; filterSizeH = false; fMatrixCtrl = false; } __aicore__ LoadData3DParamsV2(const uint8_t padListIn[PAD_SIZE], const uint16_t l1HIn, const uint16_t l1WIn, const uint16_t channelSizeIn, const uint16_t kExtensionIn, const uint16_t mExtensionIn, const uint16_t kStartPtIn, const uint16_t mStartPtIn, const uint8_t strideWIn, const uint8_t strideHIn, const uint8_t filterWIn, const uint8_t filterHIn, const uint8_t dilationFilterWIn, const uint8_t dilationFilterHIn, const bool enTransposeIn, const bool enSmallKIn, const T padValueIn) { for (int32_t i = 0; i < PAD_SIZE; ++i) { padList[i] = padListIn[i]; } l1H = l1HIn; l1W = l1WIn; channelSize = channelSizeIn; kExtension = kExtensionIn; mExtension = mExtensionIn; kStartPt = kStartPtIn; mStartPt = mStartPtIn; strideW = strideWIn; strideH = strideHIn; filterW = filterWIn; filterH = filterHIn; dilationFilterW = dilationFilterWIn; dilationFilterH = dilationFilterHIn; enTranspose = enTransposeIn; enSmallK = enSmallKIn; padValue = padValueIn; filterSizeW = false; filterSizeH = false; fMatrixCtrl = false; } __aicore__ LoadData3DParamsV2(const uint8_t padListIn[PAD_SIZE], const uint16_t l1HIn, const uint16_t l1WIn, const uint16_t channelSizeIn, const uint16_t kExtensionIn, const uint16_t mExtensionIn, const uint16_t kStartPtIn, const uint16_t mStartPtIn, const uint8_t strideWIn, const uint8_t strideHIn, const uint8_t filterWIn, const uint8_t filterHIn, const uint8_t dilationFilterWIn, const uint8_t dilationFilterHIn, const bool enTransposeIn, const bool enSmallKIn, const T padValueIn, const bool filterSizeWIn, const bool filterSizeHIn, const bool 
fMatrixCtrlIn) { for (int32_t i = 0; i < PAD_SIZE; ++i) { padList[i] = padListIn[i]; } l1H = l1HIn; l1W = l1WIn; channelSize = channelSizeIn; kExtension = kExtensionIn; mExtension = mExtensionIn; kStartPt = kStartPtIn; mStartPt = mStartPtIn; strideW = strideWIn; strideH = strideHIn; filterW = filterWIn; filterH = filterHIn; dilationFilterW = dilationFilterWIn; dilationFilterH = dilationFilterHIn; enTranspose = enTransposeIn; enSmallK = enSmallKIn; padValue = padValueIn; filterSizeW = filterSizeWIn; filterSizeH = filterSizeHIn; fMatrixCtrl = fMatrixCtrlIn; } uint8_t padList[PAD_SIZE] = {0}; uint16_t l1H = 0; uint16_t l1W = 0; uint16_t channelSize = 0; uint16_t kExtension = 0; uint16_t mExtension = 0; uint16_t kStartPt = 0; uint16_t mStartPt = 0; uint8_t strideW = 1; uint8_t strideH = 1; uint8_t filterW = 1; uint8_t filterH = 1; uint8_t dilationFilterW = 1; uint8_t dilationFilterH = 1; bool enTranspose = false; bool enSmallK = false; T padValue = 0; bool filterSizeW = false; bool filterSizeH = false; bool fMatrixCtrl = false; }; struct LoadData3DParamsV2Pro { __aicore__ LoadData3DParamsV2Pro() { channelSize = 0; enTranspose = false; enSmallK = false; filterSizeW = false; filterSizeH = false; fMatrixCtrl = false; extConfig = 0; filterConfig = 0X10101010101; } __aicore__ LoadData3DParamsV2Pro(const uint16_t channelSizeIn, const bool enTransposeIn, const bool enSmallKIn, const bool filterSizeWIn, const bool filterSizeHIn, const bool fMatrixCtrlIn, const uint64_t extConfigIn, const uint64_t filterConfigIn) { channelSize = channelSizeIn; enTranspose = enTransposeIn; enSmallK = enSmallKIn; filterSizeW = filterSizeWIn; filterSizeH = filterSizeHIn; fMatrixCtrl = fMatrixCtrlIn; extConfig = extConfigIn; filterConfig = filterConfigIn; } uint16_t channelSize = 0; bool enTranspose = false; bool enSmallK = false; bool filterSizeW = false; bool filterSizeH = false; bool fMatrixCtrl = false; uint64_t extConfig = 0; uint64_t filterConfig = 0X10101010101; }; template struct 
InitConstValueParams { __aicore__ InitConstValueParams() { repeatTimes = 0; blockNum = 0; dstGap = 0; initValue = 0; } __aicore__ InitConstValueParams(const uint16_t repeatTimesIn, const uint16_t blockNumIn, const uint16_t dstGapIn, const U initValueIn) { repeatTimes = repeatTimesIn; blockNum = blockNumIn; dstGap = dstGapIn; initValue = initValueIn; } __aicore__ InitConstValueParams(const uint16_t repeatTimesIn, const U initValueIn) { repeatTimes = repeatTimesIn; initValue = initValueIn; } uint16_t repeatTimes = 0; uint16_t blockNum = 0; uint16_t dstGap = 0; U initValue = 0; }; struct LoadImageToL1Params { __aicore__ LoadImageToL1Params() { horSize = 0; verSize = 0; horStartP = 0; verStartP = 0; sHorRes = 0; topPadSize = 0; botPadSize = 0; lPadSize = 0; rPadSize = 0; } __aicore__ LoadImageToL1Params(const uint16_t horSizeIn, const uint16_t verSizeIn, const uint16_t horStartPIn, const uint16_t verStartPIn, const uint16_t sHorResIn, const uint8_t topPadSizeIn, const uint8_t botPadSizeIn, const uint16_t lPadSizeIn, const uint16_t rPadSizeIn) { horSize = horSizeIn; verSize = verSizeIn; horStartP = horStartPIn; verStartP = verStartPIn; sHorRes = sHorResIn; topPadSize = topPadSizeIn; botPadSize = botPadSizeIn; lPadSize = lPadSizeIn; rPadSize = rPadSizeIn; } uint16_t horSize = 0; uint16_t verSize = 0; uint16_t horStartP = 0; uint16_t verStartP = 0; uint16_t sHorRes = 0; uint8_t topPadSize = 0; uint8_t botPadSize = 0; uint16_t lPadSize = 0; uint16_t rPadSize = 0; uint8_t sid = 0; }; struct LoadDataRepeatParam { __aicore__ LoadDataRepeatParam() { repeatStride = 0; repeatTime = 1; repeatMode = 0; reserved = 0; } __aicore__ LoadDataRepeatParam(const uint16_t repeatStrideIn, const uint8_t repeatTimeIn, const uint8_t repeatModeIn) { repeatStride = repeatStrideIn; repeatTime = repeatTimeIn; repeatMode = repeatModeIn; } uint16_t repeatStride = 0; uint8_t repeatTime = 1; uint8_t repeatMode = 0; uint8_t reserved = 0; }; enum class LoopMode : uint8_t { MODE_NM = 0, MODE_MN = 1, 
MODE_KM = 2, MODE_KN = 3 }; struct GemmTiling { __aicore__ GemmTiling() { mIterNum = 1; nIterNum = 1; kIterNum = 1; loopMode = LoopMode::MODE_NM; } const uint32_t blockSize = 16; LoopMode loopMode = LoopMode::MODE_NM; uint32_t mNum = 0; uint32_t nNum = 0; uint32_t kNum = 0; uint32_t roundM = 0; uint32_t roundN = 0; uint32_t roundK = 0; uint32_t c0Size = 32; uint32_t dtypeSize = 1; uint32_t mBlockNum = 0; uint32_t nBlockNum = 0; uint32_t kBlockNum = 0; uint32_t mIterNum = 0; uint32_t nIterNum = 0; uint32_t kIterNum = 0; uint32_t mTileBlock = 0; uint32_t nTileBlock = 0; uint32_t kTileBlock = 0; uint32_t kTailBlock = 0; uint32_t mTailBlock = 0; uint32_t nTailBlock = 0; bool kHasTail = false; bool mHasTail = false; bool nHasTail = false; bool kHasTailEle = false; uint32_t kTailEle = 0; }; struct Conv2dParams { __aicore__ Conv2dParams() {} __aicore__ Conv2dParams(const uint32_t imgShapeIn[CONV2D_IMG_SIZE], const uint32_t kernelShapeIn[CONV2D_KERNEL_SIZE], const uint32_t strideIn[CONV2D_STRIDE], const uint32_t cinIn, const uint32_t coutIn, const uint32_t padListIn[CONV2D_PAD], const uint32_t dilationIn[CONV2D_DILATION], const uint32_t initYIn, const bool partialSumIn) { for (int32_t i = 0; i < CONV2D_IMG_SIZE; ++i) { imgShape[i] = imgShapeIn[i]; } for (int32_t i = 0; i < CONV2D_KERNEL_SIZE; ++i) { kernelShape[i] = kernelShapeIn[i]; } for (int32_t i = 0; i < CONV2D_STRIDE; ++i) { stride[i] = strideIn[i]; } cin = cinIn; cout = coutIn; for (int32_t i = 0; i < CONV2D_PAD; ++i) { padList[i] = padListIn[i]; } for (int32_t i = 0; i < CONV2D_DILATION; ++i) { dilation[i] = dilationIn[i]; } initY = initYIn; partialSum = partialSumIn; } uint32_t imgShape[CONV2D_IMG_SIZE] = { 0 }; // [H, W] uint32_t kernelShape[CONV2D_KERNEL_SIZE] = { 0 }; // [Kh, Kw] uint32_t stride[CONV2D_STRIDE] = { 0 }; // [stride_h, stride_w] uint32_t cin = 0; // cin = C0 * C1; uint32_t cout = 0; uint32_t padList[CONV2D_PAD] = { 0 }; // [pad_left, pad_right, pad_top, pad_bottom] uint32_t 
dilation[CONV2D_DILATION] = { 0 }; // [dilation_h, dilation_w] uint32_t initY = 0; bool partialSum = false; }; struct Conv2dTilling { const uint32_t blockSize = 16; // # M block size is always 16 LoopMode loopMode = LoopMode::MODE_NM; uint32_t c0Size = 32; uint32_t dTypeSize = 1; uint32_t strideH = 0; uint32_t strideW = 0; uint32_t dilationH = 0; uint32_t dilationW = 0; uint32_t hi = 0; uint32_t wi = 0; uint32_t ho = 0; uint32_t wo = 0; uint32_t height = 0; uint32_t width = 0; uint32_t howo = 0; uint32_t mNum = 0; uint32_t nNum = 0; uint32_t kNum = 0; uint32_t mBlockNum = 0; uint32_t kBlockNum = 0; uint32_t nBlockNum = 0; uint32_t roundM = 0; uint32_t roundN = 0; uint32_t roundK = 0; uint32_t mTileBlock = 0; uint32_t nTileBlock = 0; uint32_t kTileBlock = 0; uint32_t mIterNum = 0; uint32_t nIterNum = 0; uint32_t kIterNum = 0; uint32_t mTileNums = 0; bool mHasTail = false; bool nHasTail = false; bool kHasTail = false; uint32_t kTailBlock = 0; uint32_t mTailBlock = 0; uint32_t nTailBlock = 0; uint32_t mTailNums = 0; }; struct MmadParams { __aicore__ MmadParams() { m = 0; n = 0; k = 0; unitFlag = 0; kDirectionAlign = false; cmatrixSource = false; cmatrixInitVal = true; } __aicore__ MmadParams(const uint16_t mIn, const uint16_t nIn, const uint16_t kIn, const bool isBiasIn, const int32_t fmOffsetIn, const bool enSsparseIn, const bool enWinogradAIn, const bool enWinogradBIn) { m = mIn; n = nIn; k = kIn; isBias = isBiasIn; fmOffset = fmOffsetIn; enSsparse = enSsparseIn; enWinogradA = enWinogradAIn; enWinogradB = enWinogradBIn; } __aicore__ MmadParams(const uint16_t mIn, const uint16_t nIn, const uint16_t kIn, const uint8_t unitFlagIn, const bool cmatrixSourceIn, const bool cmatrixInitValIn) { m = mIn; n = nIn; k = kIn; unitFlag = unitFlagIn; cmatrixSource = cmatrixSourceIn; cmatrixInitVal = cmatrixInitValIn; } uint16_t m = 0; uint16_t n = 0; uint16_t k = 0; // Indicates whether to accumulate the initial matrix, 0: matrix multiplication, 1: matrix multiplication and // 
addition bool isBias = false; // Left matrix offset int32_t fmOffset = 0; // Enable the structured sparse feature, default value is false bool enSsparse = false; // Indicates whether matrix a is generated by winograd_feature_map_transform, default value is false; bool enWinogradA = false; // Indicates whether matrix b is generated by winograd_feature_map_transform, default value is false; bool enWinogradB = false; uint8_t unitFlag = 0; bool kDirectionAlign = false; // Indicates the C matrix source, 1: the C matrix is in bias table buffer, 0: the C matrix is in L0C bool cmatrixSource = false; // Indicates the initial matrix, 1: the number in C matrix is 0, 0:use the real number in C matrix bool cmatrixInitVal = true; }; struct MatMulInfo { const uint16_t m{ 0 }; const uint16_t n{ 0 }; const uint16_t k{ 0 }; const bool isInitOut{ false }; const bool isBias{ false }; }; struct DropOutShapeInfo { __aicore__ DropOutShapeInfo(){}; uint32_t firstAxis = 0; uint32_t srcLastAxis = 0; uint32_t maskLastAxis = 0; }; struct SelectWithBytesMaskShapeInfo { __aicore__ SelectWithBytesMaskShapeInfo(){}; uint32_t firstAxis = 0; uint32_t srcLastAxis = 0; uint32_t maskLastAxis = 0; }; template class LocalTensor; template class GlobalTensor; template struct LayerNormParams { __aicore__ LayerNormParams(){}; LocalTensor tempTensorA; LocalTensor tempTensorB; LocalTensor tempTensorC; LocalTensor meanTmpTensor; LocalTensor varianceTmpTensor; }; template struct BatchNormParams { __aicore__ BatchNormParams(){}; float firstDimValueBack = 1.0; uint8_t srcRepeatStride = 1; uint32_t srcOffset = 1; uint32_t basicLoop = 0; uint32_t brcRepeatTimes = 0; uint32_t oriBloop = 0; uint32_t oriBTail = 0; uint32_t oriBTmpLoopOffset = 0; uint32_t oriBTmpTailOffset = 0; uint32_t oriBOutLoopOffset = 0; uint32_t oriBOutTailOffset = 0; uint32_t reduceAddLoop = 0; uint32_t reduceAddTail = 0; uint32_t reduceAddTailOffset = 0; LocalTensor tempTensorA; LocalTensor tempTensorB; LocalTensor tempTensorC; LocalTensor 
meanTmpTensor; LocalTensor varianceTmpTensor; }; template struct DeepNormParams { __aicore__ DeepNormParams(){}; float lastDimValueBack = 1.0; LocalTensor tempTensorA; LocalTensor tempTensorB; LocalTensor tempTensorC; LocalTensor meanTmpTensor; LocalTensor varianceTmpTensor; }; template struct ExpParams { __aicore__ ExpParams() {}; uint32_t inputSize = 0; // total data num uint32_t oneTmpSize = 0; // data num in one tmp buffer uint32_t firstTmpStartPos = 0; // first tmp buffer start position uint32_t secondTmpStartPos = 0; // second tmp buffer start position uint32_t thirdTmpStartPos = 0; // third tmp buffer start position uint32_t fourthTmpStartPos = 0; // fourth tmp buffer start position uint32_t loopNum = 0; // how many loop for main block calculation uint32_t tailSize = 0; // tail block uint32_t tailPos = 0; // tail block start pos uint32_t curDataLength = 0; // current data num for calculation uint32_t expandLevel = 0; // taylor param expand level LocalTensor tempTensorFloorX; // FP32 (x - floor(x)) for Taylor calculation LocalTensor tempTensorFloorXPow; // FP32 to calculate (x - floor(x)) ^ n LocalTensor tempTensorRes; // FP32 result to store sum LocalTensor tempTensorIntPart; // FP32 exp(floor(x)) result }; template struct AntiquantParams { __aicore__ AntiquantParams() {}; LocalTensor tempTensorOffset; // FP32 offset after brcb 8 * N LocalTensor tempTensorScale; // FP32 scale after brcb 8 * N LocalTensor tempTensorInput; // partial FP32 input after cast }; template struct DropOutParams { __aicore__ DropOutParams() {}; uint32_t dataSize = 0; uint32_t stackBufferSize = 0; uint32_t repeatTimes = 1; uint32_t maxRepeatSize = 0; uint32_t oneRepeatSize = 0; uint32_t currentSize = 0; uint32_t repeatRounding = 0; uint32_t repeatRemaining = 0; uint32_t repeatTail = 0; LocalTensor firstLocal; LocalTensor secondLocal; }; template struct PowerFParams { __aicore__ PowerFParams(){}; LocalTensor tmpTensor1; LocalTensor tmpTensor2; LocalTensor tmpTensor3; LocalTensor 
tmpMask1; LocalTensor tmpMask2; LocalTensor tmpMask3; LocalTensor finiteIntegerYMask; }; template struct PowerIParams { __aicore__ PowerIParams(){}; float expIterateSum; LocalTensor expUBIterate; LocalTensor oriAbsExp; LocalTensor recordExpNode; LocalTensor tmpTensor1; LocalTensor tmpTensor2; LocalTensor negMask; LocalTensor mask; LocalTensor tmpScalar; }; template struct GeluParams { __aicore__ GeluParams(){}; uint32_t repeatTimes = 1; uint32_t currentSize = 0; uint32_t repeatRounding = 0; uint32_t repeatRemaining = 0; uint32_t tail = 0; uint32_t maxRepeatSize = 0; uint32_t oneRepeatSize = 0; uint32_t dataSize = 0; uint32_t stackSize = 0; uint32_t tmpBufferSize = 0; LocalTensor sharedTmpBuffer; LocalTensor tempTensorConv; LocalTensor tempTensorA; LocalTensor tempTensorB; LocalTensor tempTensorC; }; template struct TanhParams { __aicore__ TanhParams(){}; uint32_t repeatTimes = 1; uint32_t calCount = 0; uint32_t stackSize = 0; uint32_t tmpBufferSize = 0; LocalTensor sharedTmpBuffer; LocalTensor tempTensorConv; LocalTensor tmpClip; }; template struct AscendDequantParams { __aicore__ AscendDequantParams(){}; uint64_t tmpSize; LocalTensor tmpAddrA; LocalTensor tmpAddrB; }; template struct MrgSortSrcList { __aicore__ MrgSortSrcList() {} __aicore__ MrgSortSrcList(const LocalTensor& src1In, const LocalTensor& src2In, const LocalTensor& src3In, const LocalTensor& src4In) { src1 = src1In[0]; src2 = src2In[0]; src3 = src3In[0]; src4 = src4In[0]; } LocalTensor src1; LocalTensor src2; LocalTensor src3; LocalTensor src4; }; struct MrgSort4Info { __aicore__ MrgSort4Info() {} __aicore__ MrgSort4Info(const uint16_t elementLengthsIn[MRG_SORT_ELEMENT_LEN], const bool ifExhaustedSuspensionIn, const uint16_t validBitIn, const uint16_t repeatTimesIn) { for (int32_t i = 0; i < MRG_SORT_ELEMENT_LEN; ++i) { elementLengths[i] = elementLengthsIn[i]; } ifExhaustedSuspension = ifExhaustedSuspensionIn; validBit = validBitIn; repeatTimes = repeatTimesIn; } uint16_t 
elementLengths[MRG_SORT_ELEMENT_LEN] = { 0 }; bool ifExhaustedSuspension = false; uint16_t validBit = 0; uint8_t repeatTimes = 1; }; struct DataCopyParams { __aicore__ DataCopyParams() { blockCount = DEFAULT_DATA_COPY_NBURST; blockLen = 0; srcStride = DEFAULT_DATA_COPY_STRIDE; dstStride = DEFAULT_DATA_COPY_STRIDE; } __aicore__ DataCopyParams(const uint16_t count, const uint16_t len, const uint16_t srcStrideIn, const uint16_t dstStrideIn) { blockCount = count; blockLen = len; srcStride = srcStrideIn; dstStride = dstStrideIn; } uint16_t blockCount = 0; uint16_t blockLen = 0; uint16_t srcStride = 0; uint16_t dstStride = 0; }; struct SliceInfo { __aicore__ SliceInfo() { startIndex = 0; endIndex = ONE_BLK_SIZE - 1; stride = 0; burstLen = ONE_BLK_SIZE; shapeValue = 0; } __aicore__ SliceInfo(const uint32_t startIndexIn, const uint32_t endIndexIn, const uint32_t strideIn, const uint32_t burstLenIn, const uint32_t shapeValueIn = 0) { startIndex = startIndexIn; endIndex = endIndexIn; stride = strideIn; burstLen = burstLenIn; shapeValue = shapeValueIn; } uint32_t startIndex = 0; uint32_t endIndex = 0; uint32_t stride = 0; uint32_t burstLen = 0; uint32_t shapeValue = 0; }; template __aicore__ inline uint64_t GetScalarBitcodeValue(T scalarValue) { union ScalarBitcode { __aicore__ ScalarBitcode() {} T input; uint64_t output; } data; data.input = scalarValue; return data.output; } template __aicore__ inline half GetScalarBitcodeToHalf(T scalarValue) { union ScalarBitcode { __aicore__ ScalarBitcode() {} T input; half output; } data; data.input = scalarValue; return data.output; } struct DataCopyExtParams { __aicore__ DataCopyExtParams() { blockCount = DEFAULT_DATA_COPY_NBURST; blockLen = 0; srcStride = DEFAULT_DATA_COPY_STRIDE; dstStride = DEFAULT_DATA_COPY_STRIDE; rsv = 0; } __aicore__ DataCopyExtParams(const uint16_t count, const uint32_t len, const uint32_t srcStrideIn, const uint32_t dstStrideIn, const uint32_t rsvIn) { blockCount = count; blockLen = len; srcStride = 
srcStrideIn; dstStride = dstStrideIn; rsv = rsvIn; } uint16_t blockCount = 0; uint32_t blockLen = 0; uint32_t srcStride = 0; uint32_t dstStride = 0; uint32_t rsv = 0; // reserved information }; template struct DataCopyPadExtParams { __aicore__ DataCopyPadExtParams() { isPad = false; leftPadding = 0; rightPadding = 0; paddingValue = 0; } __aicore__ DataCopyPadExtParams(const bool isPadValue, const uint8_t leftPadValue, const uint8_t rightPadValue, T padValue) { isPad = isPadValue; leftPadding = leftPadValue; rightPadding = rightPadValue; paddingValue = padValue; } bool isPad = false; uint8_t leftPadding = 0; uint8_t rightPadding = 0; T paddingValue = 0; }; struct DataCopyPadParams { __aicore__ DataCopyPadParams() { isPad = false; leftPadding = 0; rightPadding = 0; paddingValue = 0; } __aicore__ DataCopyPadParams(const bool isPadValue, const uint8_t leftPadValue, const uint8_t rightPadValue, const uint64_t padValue) { isPad = isPadValue; leftPadding = leftPadValue; rightPadding = rightPadValue; paddingValue = padValue; } bool isPad = false; uint8_t leftPadding = 0; uint8_t rightPadding = 0; uint64_t paddingValue = 0; }; struct Nd2NzParams { __aicore__ Nd2NzParams() { ndNum = 0; nValue = 0; dValue = 0; srcNdMatrixStride = 0; srcDValue = 0; dstNzC0Stride = 0; dstNzNStride = 0; dstNzMatrixStride = 0; } __aicore__ Nd2NzParams(const uint16_t ndNumIn, const uint16_t nValueIn, const uint16_t dValueIn, const uint16_t srcNdMatrixStrideIn, const uint16_t srcDValueIn, const uint16_t dstNzC0StrideIn, const uint16_t dstNzNStrideIn, const uint16_t dstNzMatrixStrideIn) { ndNum = ndNumIn; nValue = nValueIn; dValue = dValueIn; srcNdMatrixStride = srcNdMatrixStrideIn; srcDValue = srcDValueIn; dstNzC0Stride = dstNzC0StrideIn; dstNzNStride = dstNzNStrideIn; dstNzMatrixStride = dstNzMatrixStrideIn; } uint16_t ndNum = 0; uint16_t nValue = 0; uint16_t dValue = 0; uint16_t srcNdMatrixStride = 0; uint16_t srcDValue = 0; uint16_t dstNzC0Stride = 0; uint16_t dstNzNStride = 0; uint16_t 
// Contents of this span (code kept byte-identical; extraction stripped the
// angle-bracket template arguments throughout -- restore from upstream):
//  - end of Nd2NzParams, then Nz2NdParamsFull: full NZ->ND copy descriptor
//    (ndNum/nValue/dValue plus src/dst matrix strides, defaults 1 for strides);
//  - CopyRepeatParams: per-repeat block/repeat strides for vector copy;
//  - DataCopyEnhancedParams: block mode, dequant scale/value, SID store mode,
//    relu flag, pad mode/value, plus a deq tensor address;
//  - DataCopyCO12DstParams: CO1->CO2 (fixpipe-style) copy descriptor with
//    quant/relu/channel-split/nz2nd controls;
//  - QuantParams: quant mode + optional deq scalar (NOTE(review): deqScalar is
//    left uninitialized by the one-arg ctor -- presumably only read when the
//    mode requires it; confirm against callers);
//  - head of Nz2NdParams (continues on the next physical line).
dstNzMatrixStride = 0; }; struct Nz2NdParamsFull { __aicore__ Nz2NdParamsFull() { ndNum = 1; nValue = 0; dValue = 0; srcNdMatrixStride = 1; srcNStride = 0; dstDStride = 0; dstNdMatrixStride = 1; } __aicore__ Nz2NdParamsFull(const uint16_t ndNumIn, const uint16_t nValueIn, const uint16_t dValueIn, const uint16_t srcNdMatrixStrideIn, const uint16_t srcNStrideIn, const uint16_t dstDStrideIn, const uint16_t dstNdMatrixStrideIn) { ndNum = ndNumIn; nValue = nValueIn; dValue = dValueIn; srcNdMatrixStride = srcNdMatrixStrideIn; srcNStride = srcNStrideIn; dstDStride = dstDStrideIn; dstNdMatrixStride = dstNdMatrixStrideIn; } uint16_t ndNum = 1; uint16_t nValue = 0; uint16_t dValue = 0; uint16_t srcNdMatrixStride = 1; uint16_t srcNStride = 0; uint16_t dstDStride = 0; uint16_t dstNdMatrixStride = 1; }; struct CopyRepeatParams { __aicore__ CopyRepeatParams() { dstStride = DEFAULT_DATA_COPY_STRIDE; srcStride = DEFAULT_DATA_COPY_STRIDE; dstRepeatSize = DEFAULT_REPEAT_STRIDE; srcRepeatSize = DEFAULT_REPEAT_STRIDE; } __aicore__ CopyRepeatParams(const uint16_t dstStrideIn, const uint16_t srcStrideIn, uint16_t dstRepeatSizeIn, uint16_t srcRepeatSizeIn) { dstStride = dstStrideIn; srcStride = srcStrideIn; dstRepeatSize = dstRepeatSizeIn; srcRepeatSize = srcRepeatSizeIn; } uint16_t dstStride = DEFAULT_DATA_COPY_STRIDE; uint16_t srcStride = DEFAULT_DATA_COPY_STRIDE; uint16_t dstRepeatSize = DEFAULT_REPEAT_STRIDE; uint16_t srcRepeatSize = DEFAULT_REPEAT_STRIDE; }; struct DataCopyEnhancedParams { __aicore__ DataCopyEnhancedParams() { blockMode = BlockMode::BLOCK_MODE_NORMAL; deqScale = DeqScale::DEQ_NONE; deqValue = 0; sidStoreMode = 0; isRelu = false; padMode = pad_t::PAD_NONE; padValue = 0; } __aicore__ DataCopyEnhancedParams(const BlockMode blockModeIn, const DeqScale deqScaleIn, const uint64_t deqValueIn, const uint8_t sidStoreModeIn, const bool isReluIn, const pad_t padModeIn, const uint64_t padValueIn) { blockMode = blockModeIn; deqScale = deqScaleIn; deqValue = deqValueIn; 
sidStoreMode = sidStoreModeIn; isRelu = isReluIn; padMode = padModeIn; padValue = padValueIn; } BlockMode blockMode = BlockMode::BLOCK_MODE_NORMAL; DeqScale deqScale = DeqScale::DEQ_NONE; uint64_t deqValue = 0; uint8_t sidStoreMode = 0; bool isRelu = false; pad_t padMode = pad_t::PAD_NONE; uint64_t padValue = 0; uint64_t deqTensorAddr = 0; }; struct DataCopyCO12DstParams { __aicore__ DataCopyCO12DstParams() { sid = 0; nSize = 0; mSize = 0; dstStride = DEFAULT_DATA_COPY_STRIDE; srcStride = DEFAULT_DATA_COPY_STRIDE; unitFlag = 0; quantPre = QuantMode_t::NoQuant; reluPre = 0; channelSplit = false; nz2ndEn = false; } __aicore__ DataCopyCO12DstParams(const uint16_t nSizeIn, const uint16_t mSizeIn, const uint16_t dstStrideIn, const uint32_t srcStrideIn, const QuantMode_t quantPreIn, const uint8_t reluPreIn, const bool channelSplitIn, const bool nz2ndEnIn) { nSize = nSizeIn; mSize = mSizeIn; dstStride = dstStrideIn; srcStride = srcStrideIn; quantPre = quantPreIn; reluPre = reluPreIn; channelSplit = channelSplitIn; nz2ndEn = nz2ndEnIn; } uint8_t sid = 0; uint16_t nSize = 0; uint16_t mSize = 0; uint32_t dstStride = 0; uint16_t srcStride = 0; uint8_t unitFlag = 0; QuantMode_t quantPre = QuantMode_t::NoQuant; uint8_t reluPre = 0; bool channelSplit = false; bool nz2ndEn = false; }; struct QuantParams { __aicore__ QuantParams() {} __aicore__ QuantParams(const QuantMode_t quantPreIn) { quantPre = quantPreIn; } __aicore__ QuantParams(const QuantMode_t quantPreIn, const uint64_t deqScalarIn) { quantPre = quantPreIn; deqScalar = deqScalarIn; } QuantMode_t quantPre = QuantMode_t::NoQuant; uint64_t deqScalar; }; struct Nz2NdParams { __aicore__ Nz2NdParams() { nz2ndEn = false; ndNum = 1; srcNdStride = 0; dstNdStride = 0; originalNSize = 0; } __aicore__ Nz2NdParams(const bool nz2ndEnIn, const uint16_t ndNumIn, const uint16_t srcNdStrideIn, const uint16_t dstNdStrideIn, const uint16_t originalNSizeIn) { nz2ndEn = nz2ndEnIn; ndNum = ndNumIn; srcNdStride = srcNdStrideIn; dstNdStride = 
// Contents of this span (code kept byte-identical; template argument lists were
// stripped by extraction -- e.g. "template struct FixpipeParams" lost its
// parameter list; restore from upstream before compiling):
//  - end of Nz2NdParams, then CO2Layout enum (NZ / row-major ND / column-major
//    ND) with the three constexpr FixpipeConfig presets CFG_NZ/CFG_ROW_MAJOR/
//    CFG_COLUMN_MAJOR;
//  - FixpipeParams: burst-count/len + strides, plus quant, relu, nz2nd and
//    unit-flag extension fields (NOTE(review): the 4-arg ctor does not touch
//    the extension fields -- they keep their member initializers);
//  - FixpipeParamsV220: v220 variant keyed by n/m sizes; the long ctor also
//    sets quantPre/deqScalar and the nz2nd strides; deqScalar has no default
//    member initializer, same caveat as QuantParams above;
//  - TransDataTo5HDParams: high/low-half flags + repeat strides for the
//    5HD transdata instruction;
//  - head of the TransposeType enum (continues on the next physical line).
dstNdStrideIn; originalNSize = originalNSizeIn; } bool nz2ndEn = false; uint16_t ndNum = 1; uint16_t srcNdStride = 0; uint16_t dstNdStride = 0; uint16_t originalNSize = 0; }; enum class CO2Layout : uint8_t { NZ = 0, ROW_MAJOR, // ND Row COLUMN_MAJOR // ND Column }; struct FixpipeConfig { CO2Layout format; }; constexpr FixpipeConfig CFG_NZ = {CO2Layout::NZ}; constexpr FixpipeConfig CFG_ROW_MAJOR = {CO2Layout::ROW_MAJOR}; constexpr FixpipeConfig CFG_COLUMN_MAJOR = {CO2Layout::COLUMN_MAJOR}; template struct FixpipeParams { __aicore__ FixpipeParams() { cburstNum = DEFAULT_DATA_COPY_NBURST; burstLen = 1; srcStride = DEFAULT_DATA_COPY_STRIDE; dstStride = DEFAULT_DATA_COPY_STRIDE; reluEn = false; unitFlag = 0; } __aicore__ FixpipeParams(const uint16_t count, const uint16_t len, const uint16_t srcStrideIn, const uint32_t dstStrideIn) { cburstNum = count; burstLen = len; dstStride = dstStrideIn; srcStride = srcStrideIn; } uint16_t cburstNum = 0; uint16_t burstLen = 0; uint32_t dstStride = 0; uint16_t srcStride = 0; // extend param QuantParams quantParams; bool reluEn = false; Nz2NdParams nz2ndParams; uint8_t unitFlag = 0; }; struct FixpipeParamsV220 { __aicore__ FixpipeParamsV220() { nSize = 0; mSize = 0; dstStride = 0; srcStride = 0; reluEn = false; unitFlag = 0; } __aicore__ FixpipeParamsV220(const uint16_t nSizeIn, const uint16_t mSizeIn, const uint16_t srcStrideIn, const uint32_t dstStrideIn, const bool reluEnIn) { nSize = nSizeIn; mSize = mSizeIn; srcStride = srcStrideIn; dstStride = dstStrideIn; reluEn = reluEnIn; } __aicore__ FixpipeParamsV220(const uint16_t nSizeIn, const uint16_t mSizeIn, const uint16_t srcStrideIn, const uint32_t dstStrideIn, const bool reluEnIn, const QuantMode_t quantPreIn, const int64_t deqScalarIn, const uint16_t ndNumIn, const uint16_t srcNdStrideIn, const uint16_t dstNdStrideIn, const uint8_t unitFlagIn) { nSize = nSizeIn; mSize = mSizeIn; srcStride = srcStrideIn; dstStride = dstStrideIn; reluEn = reluEnIn; quantPre = quantPreIn; deqScalar = 
deqScalarIn; ndNum = ndNumIn; srcNdStride = srcNdStrideIn; dstNdStride = dstNdStrideIn; unitFlag = unitFlagIn; } uint16_t nSize = 0; uint16_t mSize = 0; // M-DirectionSize uint16_t srcStride = 0; uint32_t dstStride = 0; // Params: used for Quant QuantMode_t quantPre = QuantMode_t::NoQuant; uint64_t deqScalar; // Params: used for nz2nd uint16_t ndNum = 1; uint16_t srcNdStride = 0; uint16_t dstNdStride = 0; bool reluEn = false; uint8_t unitFlag = 0; }; struct TransDataTo5HDParams { __aicore__ TransDataTo5HDParams() { dstHighHalf = false; srcHighHalf = false; repeatTimes = 1; dstRepStride = 0; srcRepStride = 0; } __aicore__ TransDataTo5HDParams(const bool dstHighHalfIn, const bool srcHighHalfIn, const uint8_t repeatTimesIn, const uint16_t dstRepStrideIn, const uint16_t srcRepStrideIn) { dstHighHalf = dstHighHalfIn; srcHighHalf = srcHighHalfIn; repeatTimes = repeatTimesIn; dstRepStride = dstRepStrideIn; srcRepStride = srcRepStrideIn; } bool dstHighHalf = false; bool srcHighHalf = false; uint8_t repeatTimes = 1; uint16_t dstRepStride = 0; uint16_t srcRepStride = 0; }; enum class TransposeType : uint8_t { // default value TRANSPOSE_TYPE_NONE, // { shape:[B, A1, A3 / 16, A2 / 16, 16, 16], format:"NZ"} -->{ shape:[B, A2, A1, A3], ori_shape:[B, A2, A1, A3], // format:"ND"} TRANSPOSE_NZ2ND_0213, // { shape:[B, A1, A3 / 16, A2 / 16, 16, 16], format:"NZ"}-->{ shape:[B, A2, A3 / 16, A1 / 16, 16, 16], // origin_shape:[B, A2, A1, A3], format:"NZ"} TRANSPOSE_NZ2NZ_0213, // { shape:[B, H / 16, S / 16, 16, 16], format:"NZ"}-->{ shape:[B, N, H/N/16, S / 16, 16, 16], ori_shape:[B, N, S, // H/N], format:"NZ"} TRANSPOSE_NZ2NZ_012_WITH_N, // { shape:[B, H / 16, S / 16, 16, 16], format:"NZ"}-->{ shape:[B, N, S, H/N], ori_shape:[B, N, S, H/N], // format:"ND"} TRANSPOSE_NZ2ND_012_WITH_N, // { shape:[B, N, H/N/16, S/16, 16, 16], format:"NZ"}-->{ shape:[B, S, H], ori_shape:[B, S, H], format:"ND"} TRANSPOSE_NZ2ND_012_WITHOUT_N, // { shape:[B, N, H/N/16, S/16, 16, 16], format:"NZ"}-->{ 
// Contents of this span (code kept byte-identical):
//  - remainder of the TransposeType enum: each enumerator's comment documents
//    the source/destination layout of the transpose variant (NZ<->ND, with or
//    without the N split, NCHW<->NHWC, etc.);
//  - ConfusionTranspose0213Tiling: precomputed tiling constants (block sizes,
//    aligned dims, offsets) for the 0213 confusion-transpose kernel -- both
//    ctors just copy the 16 fields through;
//  - ConfusionTranspose2NZ012NTiling: same pattern, 17 fields, for the
//    NZ 012-with-N variant;
//  - head of ConfusionTranspose2ND012NTiling (continues on next line).
// NOTE(review): these tiling structs are pure field-bags; the meaning of each
// field (e.g. "gap", "prehBlockNum") is not derivable from this chunk --
// consult the ConfusionTranspose implementation before documenting further.
shape:[B, H/16, S/16, 16, 16], ori_shape:[B, S, H], // format:"NZ"} TRANSPOSE_NZ2NZ_012_WITHOUT_N, TRANSPOSE_ND2ND_ONLY, // { shape:[H, W], format:"ND"} -->{ shape:[W, H], format:"ND"} TRANSPOSE_ND_UB_GM, // [B, N, S, H/N] -> [B, S, H] TRANSPOSE_GRAD_ND_UB_GM, // [B, S, H] -> [B, N, S, H/N] TRANSPOSE_ND2ND_B16, // { shape:[16, 16], format:"ND", dataType: B16} -->{ shape:[16, 16], format:"ND"} TRANSPOSE_NCHW2NHWC, // [ N, C, H, W] -> [N, H, W, C] TRANSPOSE_NHWC2NCHW // [ N, H, W, C] -> [N, C, H, W] }; struct ConfusionTranspose0213Tiling { __aicore__ ConfusionTranspose0213Tiling() { blockSize = 0; shapeB = 0; shapeA1 = 0; alignA3 = 0; alignA2 = 0; widthTiling = 0; newPopSize = 0; newPopH = 0; needSize = 0; mainBlocks = 0; tailSize = 0; alignA2MulAlignA3 = 0; batchOffset = 0; alignA3MulA1 = 0; shapeA1BlockCube = 0; mainOffset = 0; } __aicore__ ConfusionTranspose0213Tiling(uint32_t blockSizeIn, uint32_t shapeBIn, uint32_t shapeA1In, uint32_t alignA3In, uint32_t alignA2In, uint32_t widthTilingIn, uint32_t newPopSizeIn, uint32_t newPopHIn, uint32_t needSizeIn, uint32_t mainBlocksIn, uint32_t tailSizeIn, uint32_t alignA2MulAlignA3In, uint32_t batchOffsetIn, uint32_t alignA3MulA1In, uint32_t shapeA1BlockCubeIn, uint32_t mainOffsetIn) { blockSize = blockSizeIn; shapeB = shapeBIn; shapeA1 = shapeA1In; alignA3 = alignA3In; alignA2 = alignA2In; widthTiling = widthTilingIn; newPopSize = newPopSizeIn; newPopH = newPopHIn; needSize = needSizeIn; mainBlocks = mainBlocksIn; tailSize = tailSizeIn; alignA2MulAlignA3 = alignA2MulAlignA3In; batchOffset = batchOffsetIn; alignA3MulA1 = alignA3MulA1In; shapeA1BlockCube = shapeA1BlockCubeIn; mainOffset = mainOffsetIn; } uint32_t blockSize = 0; uint32_t shapeB = 0; uint32_t shapeA1 = 0; uint32_t alignA3 = 0; uint32_t alignA2 = 0; uint32_t widthTiling = 0; uint32_t newPopSize = 0; uint32_t newPopH = 0; uint32_t needSize = 0; uint32_t mainBlocks = 0; uint32_t tailSize = 0; uint32_t alignA2MulAlignA3 = 0; uint32_t batchOffset = 0; uint32_t 
alignA3MulA1 = 0; uint32_t shapeA1BlockCube = 0; uint32_t mainOffset = 0; }; struct ConfusionTranspose2NZ012NTiling { __aicore__ ConfusionTranspose2NZ012NTiling() { blockSize = 0; shapeB = 0; shapeN = 0; hnDiv = 0; blockNum = 0; shapeH = 0; hBlockNum = 0; sBlockNum = 0; alignH = 0; alignS = 0; hnDivBlockNum = 0; alignHnDiv = 0; gap = 0; alignsBlockCube = 0; prehBlockNum = 0; dstBatchOffset = 0; srcBatchOffset = 0; } __aicore__ ConfusionTranspose2NZ012NTiling(uint32_t blockSizeIn, uint32_t shapeBIn, uint32_t shapeNIn, uint32_t hnDivIn, uint32_t blockNumIn, uint32_t shapeHIn, uint32_t hBlockNumIn, uint32_t sBlockNumIn, uint32_t alignHIn, uint32_t alignSIn, uint32_t hnDivBlockNumIn, uint32_t alignHnDivIn, uint32_t gapIn, uint32_t alignsBlockCubeIn, uint32_t prehBlockNumIn, uint32_t dstBatchOffsetIn, uint32_t srcBatchOffsetIn) { blockSize = blockSizeIn; shapeB = shapeBIn; shapeN = shapeNIn; hnDiv = hnDivIn; blockNum = blockNumIn; shapeH = shapeHIn; hBlockNum = hBlockNumIn; sBlockNum = sBlockNumIn; alignH = alignHIn; alignS = alignSIn; hnDivBlockNum = hnDivBlockNumIn; alignHnDiv = alignHnDivIn; gap = gapIn; alignsBlockCube = alignsBlockCubeIn; prehBlockNum = prehBlockNumIn; dstBatchOffset = dstBatchOffsetIn; srcBatchOffset = srcBatchOffsetIn; } uint32_t blockSize = 0; uint32_t shapeB = 0; uint32_t shapeN = 0; uint32_t hnDiv = 0; uint32_t blockNum = 0; uint32_t shapeH = 0; uint32_t hBlockNum = 0; uint32_t sBlockNum = 0; uint32_t alignH = 0; uint32_t alignS = 0; uint32_t hnDivBlockNum = 0; uint32_t alignHnDiv = 0; uint32_t gap = 0; uint32_t alignsBlockCube = 0; uint32_t prehBlockNum = 0; uint32_t dstBatchOffset = 0; uint32_t srcBatchOffset = 0; }; struct ConfusionTranspose2ND012NTiling { __aicore__ ConfusionTranspose2ND012NTiling() { blockSize = 0; shapeB = 0; shapeN = 0; hnDiv = 0; shapeH = 0; hBlockNum = 0; sBlockNum = 0; hnDivBlockNum = 0; alignHnDiv = 0; gap = 0; alignsCube = 0; prehBlockNum = 0; alignsMulAlignHnDiv = 0; alignHnDivCube = 0; alignHnDivBlockSize = 0; 
// Contents of this span (code kept byte-identical):
//  - remainder of ConfusionTranspose2ND012NTiling (18 uint32_t fields, ctor is
//    a straight field copy);
//  - ConfusionTranspose012Tiling: 15-field tiling bag for the plain 012
//    confusion-transpose variant;
//  - ConfusionTransposeOnlyTiling: 6-field tiling for the plain ND 2-D
//    transpose (blockSize/height/width/highBlock/stride/repeat);
//  - head of TransposeParamsExt (n/c/h/w sizes + TransposeType; continues on
//    the next physical line).
dstBatchOffset = 0; srcBatchOffset = 0; blockNum = 0; } __aicore__ ConfusionTranspose2ND012NTiling(uint32_t blockSizeIn, uint32_t shapeBIn, uint32_t shapeNIn, uint32_t hnDivIn, uint32_t shapeHIn, uint32_t hBlockNumIn, uint32_t sBlockNumIn, uint32_t hnDivBlockNumIn, uint32_t alignHnDivIn, uint32_t gapIn, uint32_t alignsCubeIn, uint32_t prehBlockNumIn, uint32_t alignsMulAlignHnDivIn, uint32_t alignHnDivCubeIn, uint32_t alignHnDivBlockSizeIn, uint32_t dstBatchOffsetIn, uint32_t srcBatchOffsetIn, uint32_t blockNumIn) { blockSize = blockSizeIn; shapeB = shapeBIn; shapeN = shapeNIn; hnDiv = hnDivIn; shapeH = shapeHIn; hBlockNum = hBlockNumIn; sBlockNum = sBlockNumIn; hnDivBlockNum = hnDivBlockNumIn; alignHnDiv = alignHnDivIn; gap = gapIn; alignsCube = alignsCubeIn; prehBlockNum = prehBlockNumIn; alignsMulAlignHnDiv = alignsMulAlignHnDivIn; alignHnDivCube = alignHnDivCubeIn; alignHnDivBlockSize = alignHnDivBlockSizeIn; dstBatchOffset = dstBatchOffsetIn; srcBatchOffset = srcBatchOffsetIn; blockNum = blockNumIn; } uint32_t blockSize = 0; uint32_t shapeB = 0; uint32_t shapeN = 0; uint32_t hnDiv = 0; uint32_t shapeH = 0; uint32_t hBlockNum = 0; uint32_t sBlockNum = 0; uint32_t hnDivBlockNum = 0; uint32_t alignHnDiv = 0; uint32_t gap = 0; uint32_t alignsCube = 0; uint32_t prehBlockNum = 0; uint32_t alignsMulAlignHnDiv = 0; uint32_t alignHnDivCube = 0; uint32_t alignHnDivBlockSize = 0; uint32_t dstBatchOffset = 0; uint32_t srcBatchOffset = 0; uint32_t blockNum = 0; }; struct ConfusionTranspose012Tiling { __aicore__ ConfusionTranspose012Tiling() { blockSize = 0; shapeB = 0; shapeN = 0; hnDiv = 0; shapeH = 0; hBlockNum = 0; sBlockNum = 0; hnDivBlockNum = 0; alignH = 0; alignsCube = 0; alignhBlockCube = 0; blockSizeMulAlignH = 0; srcBatchOffset = 0; dstBatchOffset = 0; blockNum = 0; } __aicore__ ConfusionTranspose012Tiling(uint32_t blockSizeIn, uint32_t shapeBIn, uint32_t shapeNIn, uint32_t hnDivIn, uint32_t shapeHIn, uint32_t hBlockNumIn, uint32_t sBlockNumIn, uint32_t 
hnDivBlockNumIn, uint32_t alignHIn, uint32_t alignsCubeIn, uint32_t alignhBlockCubeIn, uint32_t blockSizeMulAlignHIn, uint32_t srcBatchOffsetIn, uint32_t dstBatchOffsetIn, uint32_t blockNumIn) { blockSize = blockSizeIn; shapeB = shapeBIn; shapeN = shapeNIn; hnDiv = hnDivIn; shapeH = shapeHIn; hBlockNum = hBlockNumIn; sBlockNum = sBlockNumIn; hnDivBlockNum = hnDivBlockNumIn; alignH = alignHIn; alignsCube = alignsCubeIn; alignhBlockCube = alignhBlockCubeIn; blockSizeMulAlignH = blockSizeMulAlignHIn; srcBatchOffset = srcBatchOffsetIn; dstBatchOffset = dstBatchOffsetIn; blockNum = blockNumIn; } uint32_t blockSize = 0; uint32_t shapeB = 0; uint32_t shapeN = 0; uint32_t hnDiv = 0; uint32_t shapeH = 0; uint32_t hBlockNum = 0; uint32_t sBlockNum = 0; uint32_t hnDivBlockNum = 0; uint32_t alignH = 0; uint32_t alignsCube = 0; uint32_t alignhBlockCube = 0; uint32_t blockSizeMulAlignH = 0; uint32_t srcBatchOffset = 0; uint32_t dstBatchOffset = 0; uint32_t blockNum = 0; }; struct ConfusionTransposeOnlyTiling { __aicore__ ConfusionTransposeOnlyTiling() { blockSize = 0; height = 0; width = 0; highBlock = 0; stride = 0; repeat = 0; } __aicore__ ConfusionTransposeOnlyTiling(uint32_t blockSizeIn, uint32_t heightIn, uint32_t widthIn, uint32_t highBlockIn, uint32_t strideIn, uint32_t repeatIn) { blockSize = blockSizeIn; height = heightIn; width = widthIn; highBlock = highBlockIn; stride = strideIn; repeat = repeatIn; } uint32_t blockSize = 0; uint32_t height = 0; uint32_t width = 0; uint32_t highBlock = 0; uint32_t stride = 0; uint32_t repeat = 0; }; struct TransposeParamsExt { __aicore__ TransposeParamsExt() { nSize = 0; cSize = 0; hSize = 0; wSize = 0; transposeType = TransposeType::TRANSPOSE_ND2ND_B16; } __aicore__ TransposeParamsExt(const uint16_t nSizeIn, const uint16_t cSizeIn, const uint16_t hSizeIn, const uint16_t wSizeIn, const TransposeType transposeTypeIn) { nSize = nSizeIn; cSize = cSizeIn; hSize = hSizeIn; wSize = wSizeIn; transposeType = transposeTypeIn; } uint16_t nSize 
// Contents of this span (code kept byte-identical; extraction stripped template
// parameter lists and the type arguments of static_cast/reinterpret_cast and
// map-like accessors -- restore "<...>" contents from upstream):
//  - end of TransposeParamsExt, GatherMaskParams (block/repeat strides for the
//    gather-mask instruction), IntriInfo (repeat decomposition: full repeats,
//    255-repeat roundings, remainder, tail), DataFormat enum, PadParams /
//    UnPadParams;
//  - start of class AscendCUtils (static helpers):
//    * GetBitSize/GetC0Size/GetC0Count/GetDefaultBlockNum/GetRsvdCnt: size
//      helpers; GetC0Count asserts dtypeSize != 0 before dividing;
//    * SetMask(maskHigh, maskLow): validates masks under __CCE_KT_TEST__ then
//      programs the vector mask (skipped entirely when isSetMask is false, and
//      on AIC cores via ASCEND_IS_NOT_AIC);
//    * SetMask(len): converts an element count into the 128-bit mask pair;
//      the halfTypeLen==64 constant is the per-register b64 lane count;
//    * SetMaskCount/SetMaskNorm: switch mask counting mode;
//    * SetOverflow: toggles saturated vs inf/nan mode -- note the CTRL bit and
//      polarity differ per arch (bit 48 on >=220, bit 53 inverted on 200);
//    * ResetMask: restores the full mask;
//    * CalIntriInfo: splits calCount into repeats of repStride*c0Count plus
//      tail, and factors repeats by MAX_REPEAT_TIMES;
//    * GetTemporaryBufferAddr: UB scratch address -- CPU-sim path checks
//      alignment/bounds against ConstDefiner, device path offsets from imm(0)
//      (assert block continues on the next physical line).
= 0; uint16_t cSize = 0; uint16_t hSize = 0; uint16_t wSize = 0; TransposeType transposeType = TransposeType::TRANSPOSE_ND2ND_B16; }; struct GatherMaskParams { __aicore__ GatherMaskParams() { src0BlockStride = DEFAULT_BLK_STRIDE; repeatTimes = 0; src0RepeatStride = DEFAULT_REPEAT_STRIDE; src1RepeatStride = DEFAULT_REPEAT_STRIDE; } __aicore__ GatherMaskParams(const uint8_t src0BlockStrideIn, const uint16_t repeatTimesIn, const uint16_t src0RepeatStrideIn, const uint8_t src1RepeatStrideIn) { src0BlockStride = src0BlockStrideIn; repeatTimes = repeatTimesIn; src0RepeatStride = src0RepeatStrideIn; src1RepeatStride = src1RepeatStrideIn; } uint8_t src0BlockStride = DEFAULT_BLK_STRIDE; uint16_t repeatTimes = 0; uint16_t src0RepeatStride = DEFAULT_REPEAT_STRIDE; uint8_t src1RepeatStride = DEFAULT_REPEAT_STRIDE; }; struct IntriInfo { uint32_t c0Count{ 0 }; uint32_t repeat{ 0 }; uint32_t repeatRounding{ 0 }; uint32_t repeatRemaining{ 0 }; uint32_t tail{ 0 }; }; enum class DataFormat : uint8_t { ND = 0, NZ, NCHW, NC1HWC0, NHWC, }; struct PadParams { __aicore__ PadParams() { leftPad = 0; rightPad = 0; padValue = 0; } __aicore__ PadParams(const uint16_t leftPadIn, const uint16_t rightPadIn, const int32_t padValueIn) { leftPad = leftPadIn; rightPad = rightPadIn; padValue = padValueIn; } uint16_t leftPad = 0; uint16_t rightPad = 0; int32_t padValue = 0; }; struct UnPadParams { __aicore__ UnPadParams() { leftPad = 0; rightPad = 0; } __aicore__ UnPadParams(const uint16_t leftPadIn, const uint16_t rightPadIn) { leftPad = leftPadIn; rightPad = rightPadIn; } uint16_t leftPad = 0; uint16_t rightPad = 0; }; class AscendCUtils { public: __aicore__ static inline int32_t GetBitSize(int32_t byteSize) { return byteSize * ONE_BYTE_BIT_SIZE; } __aicore__ static inline int32_t GetC0Size() { return DEFAULT_C0_SIZE; } __aicore__ static inline int32_t GetC0Count(const int32_t dtypeSize) { ASCENDC_ASSERT((dtypeSize != 0), { KERNEL_LOG(KERNEL_ERROR, "dtypeSize can not be 0"); }); return GetC0Size() / 
dtypeSize; } __aicore__ static inline int32_t GetDefaultBlockNum() { return DEFAULT_BLK_NUM; } __aicore__ static inline int64_t GetRsvdCnt() { return get_rsvd_cnt(); } template __aicore__ static inline void SetMask(const uint64_t& maskHigh, const uint64_t& maskLow) { if constexpr (!isSetMask) { return; } #if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1 if (sizeof(T) >= sizeof(int32_t)) { ASCENDC_ASSERT((maskHigh == 0ULL), { KERNEL_LOG(KERNEL_ERROR, "maskHigh must be 0 for type b32 and b64"); }); } ASCENDC_ASSERT(((maskLow != 0ULL) || (maskHigh != 0ULL)), { KERNEL_LOG(KERNEL_ERROR, "maskLow and maskHigh can not be zero at the same time"); }); #endif if ASCEND_IS_NOT_AIC { set_vector_mask(maskHigh, maskLow); } } template __aicore__ static inline void SetMask(int32_t len) { if constexpr (!isSetMask) { return; } int32_t typeLen; if constexpr (IsSameType::value) { typeLen = DEFAULT_BLOCK_SIZE * INT4_TWO; } else { typeLen = DEFAULT_BLOCK_SIZE / sizeof(T); } const int32_t halfTypeLen = 64; if (len == halfTypeLen) { SetMask(0, FULL_MASK); return; } else if (len == typeLen) { SetMask(FULL_MASK, FULL_MASK); return; } else if (len >= halfTypeLen * 2) { SetMask(FULL_MASK, FULL_MASK); return; } SetMask(static_cast( (len > halfTypeLen) ? (((static_cast(1)) << static_cast(len - halfTypeLen)) - 1) : 0), static_cast( (len > halfTypeLen) ? 
FULL_MASK : (((static_cast(1)) << static_cast(len)) - 1))); return; } template __aicore__ static inline void SetMaskCount() { set_mask_count(); } template __aicore__ static inline void SetMaskNorm() { set_mask_norm(); } #if __CCE_AICORE__ >= 220 __aicore__ static inline void SetOverflow(uint64_t ctrlValue) { // set CTRL[48] is 1 --- inf/nan mode // set CTRL[48] is 0 --- saturated mode if (ctrlValue == 1) { set_ctrl(sbitset1(get_ctrl(), CTRL_48_BIT)); } else { set_ctrl(sbitset0(get_ctrl(), CTRL_48_BIT)); } } #elif __CCE_AICORE__ == 200 __aicore__ static inline void SetOverflow(uint64_t ctrlValue) { // set CTRL[53] is 1 --- saturated mode // set CTRL[53] is 0 --- inf/nan mode if (ctrlValue == 0) { set_ctrl(sbitset1(get_ctrl(), CTRL_53_BIT)); } else { set_ctrl(sbitset0(get_ctrl(), CTRL_53_BIT)); } } #endif template __aicore__ static inline void ResetMask() { if constexpr (!isSetMask) { return; } if ASCEND_IS_NOT_AIC { set_vector_mask(FULL_MASK, FULL_MASK); } } template __aicore__ inline static IntriInfo CalIntriInfo( const uint32_t dtypeSize, const uint32_t calCount, uint32_t repStride = DEFAULT_BLK_NUM) { IntriInfo retIntriInfo; retIntriInfo.c0Count = GetC0Count(dtypeSize); if constexpr (isInt4) { retIntriInfo.c0Count = GetC0Size() * INT4_TWO; } uint32_t repeatCount = repStride * retIntriInfo.c0Count; retIntriInfo.repeat = calCount / repeatCount; retIntriInfo.tail = calCount % repeatCount; retIntriInfo.repeatRounding = retIntriInfo.repeat / MAX_REPEAT_TIMES; retIntriInfo.repeatRemaining = retIntriInfo.repeat % MAX_REPEAT_TIMES; return retIntriInfo; } template __aicore__ static inline __ubuf__ T* GetTemporaryBufferAddr(const int32_t bufferOffset, const int32_t bufferSize) { #if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1 ASCENDC_ASSERT((bufferOffset % ONE_BLK_SIZE == 0), { KERNEL_LOG(KERNEL_ERROR, "bufferOffset is %d, which must be 32B aligned", bufferOffset); }); ASCENDC_ASSERT( (bufferOffset + bufferSize * sizeof(T) <= 
// Contents of this span (code kept byte-identical; template/type arguments in
// "<...>" were stripped by extraction -- restore from upstream):
//  - remainder of AscendCUtils:
//    * GetTemporaryBufferAddr tail + FreeTemporaryBuffer (no-op release);
//    * GetTemporaryFbBufferAddr / FreeTemporaryFbBuffer (fixbuf scratch,
//      v220+ only), same CPU-sim bounds checks against ConstDefiner;
//    * GetGMLen overloads: compute the global-memory span touched by a
//      DataCopyParams / DataCopyExtParams burst (burst/stride unit is 32 bytes
//      unless the mov-align intrinsic is used, then 1 byte) and by an
//      Nd2NzParams descriptor;
//    * CheckGmMemOverflow: under ASCENDC_OOM, walks g_oomAddrArange and traps
//      (arch-specific trap form) when gmAddr is outside every registered
//      range, or when gmAddr+gmLen overruns the matched range;
//    * CheckGmMemOverflowNormal / CheckGmMemOverflowNd2Nz: thin wrappers that
//      derive gmLen then delegate;
//  - start of the __CCE_KT_TEST__-only atomic-emulation helpers: g_isAtomic /
//    g_atomicType and the generic DataCopyWithAtomic, which emulates
//    SUM/MAX/MIN atomics element-wise on the CPU before the copy (MAX/MIN
//    branches continue on the next physical line).
ConstDefiner::Instance().bufferInitLen.at(Hardware::UB)), { KERNEL_LOG(KERNEL_ERROR, "bufferOffset is %d, bufferSize is %d, which exceed the limit of ub %d", bufferOffset, bufferSize, ConstDefiner::Instance().bufferInitLen.at(Hardware::UB)); }); const int32_t maxTempSize = 0x100000; ASCENDC_ASSERT((bufferSize < maxTempSize), { KERNEL_LOG(KERNEL_ERROR, "bufferSize is %d, which exceed the maxTempSize limits %d", bufferSize, maxTempSize); }); T* addr = reinterpret_cast(ConstDefiner::Instance().hardwareCpuBufferMap.at(Hardware::UB) + bufferOffset); #else (void)bufferSize; __ubuf__ T* addr = reinterpret_cast<__ubuf__ T*>(get_imm(0) + bufferOffset); #endif return addr; } template __aicore__ static inline void FreeTemporaryBuffer(__ubuf__ T* addr) { (void)addr; } #if __CCE_AICORE__ >= 220 template __aicore__ static inline __fbuf__ T* GetTemporaryFbBufferAddr(const int32_t bufferOffset, const int32_t bufferSize) { #if defined(__CCE_KT_TEST__) && __CCE_KT_TEST__ == 1 ASCENDC_ASSERT((bufferOffset % ONE_BLK_SIZE == 0), { KERNEL_LOG(KERNEL_ERROR, "bufferOffset is %d, which must be 32B aligned", bufferOffset); }); ASCENDC_ASSERT( (bufferOffset + bufferSize * sizeof(T) <= ConstDefiner::Instance().bufferInitLen.at(Hardware::FIXBUF)), { KERNEL_LOG(KERNEL_ERROR, "bufferOffset is %d, bufferSize is %d, which exceed the limit of fixbuf %d", bufferOffset, bufferSize, ConstDefiner::Instance().bufferInitLen.at(Hardware::FIXBUF)); }); T* addr = reinterpret_cast(ConstDefiner::Instance().hardwareCpuBufferMap.at(Hardware::FIXBUF) + bufferOffset); #else (void)bufferSize; __fbuf__ T* addr = reinterpret_cast<__fbuf__ T*>(get_imm(0) + bufferOffset); #endif return addr; } template __aicore__ static inline void FreeTemporaryFbBuffer(__fbuf__ T* addr) { (void)addr; } #endif __aicore__ static inline uint64_t GetGMLen(const DataCopyParams& intriParams, const bool& isSrc, const bool& isMovAlignIntri) { uint16_t stride = intriParams.dstStride; uint16_t burstLenUnit = 32; uint16_t strideUnit = 32; if 
(isSrc) { stride = intriParams.srcStride; } if (isMovAlignIntri) { burstLenUnit = 1; strideUnit = 1; } uint64_t gmLen = static_cast(intriParams.blockCount) * intriParams.blockLen * burstLenUnit + (intriParams.blockCount - 1) * stride * strideUnit; return gmLen; } __aicore__ static inline uint64_t GetGMLen(const DataCopyExtParams& intriParams, const bool& isSrc, const bool& isMovAlignIntri) { uint16_t stride = intriParams.dstStride; uint16_t burstLenUnit = 32; uint16_t strideUnit = 32; if (isSrc) { stride = intriParams.srcStride; } if (isMovAlignIntri) { burstLenUnit = 1; strideUnit = 1; } uint64_t gmLen = static_cast(intriParams.blockCount) * intriParams.blockLen * burstLenUnit + (intriParams.blockCount - 1) * stride * strideUnit; return gmLen; } __aicore__ static inline uint64_t GetGMLen(const uint64_t& srcEleSize, const Nd2NzParams& intriParams) { constexpr uint16_t c0Size = 32; uint64_t gmLen = (static_cast(intriParams.ndNum) - 1) * srcEleSize * intriParams.srcNdMatrixStride + (intriParams.nValue - 1) * intriParams.srcDValue * srcEleSize + intriParams.dValue * srcEleSize; return gmLen; } template __aicore__ static inline void CheckGmMemOverflow(__gm__ T* gmAddr, const bool& isSrc, const uint64_t& gmLen) { #if defined(ASCENDC_OOM) && ASCENDC_OOM == 1 uintptr_t inputOutputAddr = 0; uint64_t inputOutputLen = 0; constexpr uint64_t errCode = 0X5A5A0001; uintptr_t gmAddrConvert = reinterpret_cast(gmAddr); if (g_oomAddrArange.count == 0) { return; } for (uint64_t index = 0; index < g_oomAddrArange.count; index++) { if (g_oomAddrArange.addr[index] == 0 || g_oomAddrArange.len[index] == 0) { continue; } inputOutputAddr = g_oomAddrArange.addr[index]; inputOutputLen = g_oomAddrArange.len[index]; if (gmAddrConvert >= inputOutputAddr && gmAddrConvert < inputOutputAddr + inputOutputLen) { break; } if (index == g_oomAddrArange.count - 1) { #if __CCE_AICORE__ == 300 || __CCE_AICORE__ == 310 trap(); #else trap(errCode); #endif } } if (gmAddrConvert + gmLen > inputOutputAddr + 
inputOutputLen) { #if __CCE_AICORE__ == 300 || __CCE_AICORE__ == 310 trap(); #else trap(errCode); #endif } #endif } template __aicore__ static inline void CheckGmMemOverflowNormal(__gm__ T* gmAddr, __gm__ uint8_t* workSpace, const bool& isSrc, const uint64_t& isMovAlignIntri, const DataCopyParams& intriParams) { (void)(workSpace); uint64_t gmLen = GetGMLen(intriParams, isSrc, isMovAlignIntri); CheckGmMemOverflow(gmAddr, isSrc, gmLen); } template __aicore__ static inline void CheckGmMemOverflowNormal(__gm__ T* gmAddr, __gm__ uint8_t* workSpace, const bool& isSrc, const uint64_t& isMovAlignIntri, const DataCopyExtParams& intriParams) { (void)(workSpace); uint64_t gmLen = GetGMLen(intriParams, isSrc, isMovAlignIntri); CheckGmMemOverflow(gmAddr, isSrc, gmLen); } template __aicore__ static inline void CheckGmMemOverflowNd2Nz(__gm__ T* gmAddr, __gm__ uint8_t* workSpace, const bool& isSrc, const Nd2NzParams& intriParams) { (void)(workSpace); uint64_t srcEleSize = sizeof(T); uint64_t gmLen = GetGMLen(srcEleSize, intriParams); CheckGmMemOverflow(gmAddr, isSrc, gmLen); } }; #ifdef __CCE_KT_TEST__ enum AtomicType { SUM, MAX, MIN }; extern bool g_isAtomic; extern AtomicType g_atomicType; template __aicore__ inline void DataCopyWithAtomic(__gm__ T* dst, __ubuf__ T* src, const DataCopyParams& intriParams) { if (!g_isAtomic) { return; } const uint16_t nBurst = intriParams.blockCount; const uint16_t lenBurst = intriParams.blockLen; const uint16_t srcStride = intriParams.srcStride; const uint16_t dstStride = intriParams.dstStride; // new one buffer and do add uint32_t dstOffset = 0; uint32_t srcOffset = 0; const int repeat = (lenBurst * ONE_BLK_SIZE + ONE_REPEAT_BYTE_SIZE - 1) / ONE_REPEAT_BYTE_SIZE; for (int index = 0; index < nBurst; ++index) { for (int indexJ = 0; indexJ < lenBurst * ONE_BLK_SIZE / sizeof(T); ++indexJ) { if (g_atomicType == SUM) { *(static_cast(src) + srcOffset + indexJ) = *(static_cast(dst) + dstOffset + indexJ) + *(static_cast(src) + srcOffset + indexJ); } 
// Contents of this span (code kept byte-identical; "<...>" template/type
// arguments were stripped by extraction -- restore from upstream):
//  - tail of the generic DataCopyWithAtomic (MAX/MIN element-wise emulation,
//    then per-burst offset advance by (lenBurst+stride)*32B / sizeof(T));
//  - DataCopyWithAtomicCom: vectorized CPU-sim emulation -- for each burst it
//    applies vadd/vmax/vmin over full repeats with a full mask, then handles
//    the tail with a narrowed mask and resets the mask (guarded to
//    __CCE_AICORE__ <= 220);
//  - half/float overloads of DataCopyWithAtomic that early-return unless
//    g_isAtomic and delegate to DataCopyWithAtomicCom;
//  - bf16 helpers (>=220, except 310): ToBfloat16 truncates a float by
//    dropping the low 16 mantissa bits, ToFloat widens by shifting back.
//    NOTE(review): these use pointer-reinterpret punning rather than a
//    bit-cast; truncation (no rounding) is presumably intentional -- confirm.
//  - closing of namespace AscendC and the include guard.
else if (g_atomicType == MAX) { *(static_cast(src) + srcOffset + indexJ) = std::max(*(static_cast(dst) + dstOffset + indexJ), *(static_cast(src) + srcOffset + indexJ)); } else { *(static_cast(src) + srcOffset + indexJ) = std::min(*(static_cast(dst) + dstOffset + indexJ), *(static_cast(src) + srcOffset + indexJ)); } } dstOffset += ((lenBurst + dstStride) * ONE_BLK_SIZE) / sizeof(T); srcOffset += ((lenBurst + srcStride) * ONE_BLK_SIZE) / sizeof(T); } } template __aicore__ inline void DataCopyWithAtomicCom(__gm__ T* dst, __ubuf__ T* src, const DataCopyParams& intriParams) { const uint16_t nBurst = intriParams.blockCount; const uint16_t lenBurst = intriParams.blockLen; const uint16_t srcStride = intriParams.srcStride; const uint16_t dstStride = intriParams.dstStride; const uint16_t halfSize = sizeof(T); // new one buffer and do add uint32_t dstOffset = 0; uint32_t srcOffset = 0; const int repeat = (lenBurst * ONE_BLK_SIZE) / ONE_REPEAT_BYTE_SIZE; const int countInRepeat = (ONE_REPEAT_BYTE_SIZE / halfSize); const int tail = lenBurst * ONE_BLK_SIZE / halfSize - repeat * countInRepeat; for (int index = 0; index < nBurst; ++index) { __ubuf__ T* dstAddr = static_cast<__ubuf__ T*>(src) + srcOffset; __ubuf__ T* src0Addr = static_cast<__ubuf__ T*>(dst) + dstOffset; __ubuf__ T* src1Addr = static_cast<__ubuf__ T*>(src) + srcOffset; #if __CCE_AICORE__ <= 220 if (repeat > 0) { AscendCUtils::SetMask(countInRepeat); if (g_atomicType == SUM) { vadd(static_cast(dstAddr), static_cast(src0Addr), static_cast(src1Addr), repeat, 1, 1, 1, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM); } else if (g_atomicType == MAX) { vmax(static_cast(dstAddr), static_cast(src0Addr), static_cast(src1Addr), repeat, 1, 1, 1, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM); } else { vmin(static_cast(dstAddr), static_cast(src0Addr), static_cast(src1Addr), repeat, 1, 1, 1, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM); } AscendCUtils::ResetMask(); } if (tail != 0) { dstAddr = dstAddr + repeat * 
countInRepeat; src0Addr = src0Addr + repeat * countInRepeat; src1Addr = src1Addr + repeat * countInRepeat; AscendCUtils::SetMask(tail); if (g_atomicType == SUM) { vadd(static_cast(dstAddr), static_cast(src0Addr), static_cast(src1Addr), 1, 1, 1, 1, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM); } else if (g_atomicType == MAX) { vmax(static_cast(dstAddr), static_cast(src0Addr), static_cast(src1Addr), 1, 1, 1, 1, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM); } else { vmin(static_cast(dstAddr), static_cast(src0Addr), static_cast(src1Addr), 1, 1, 1, 1, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM, DEFAULT_BLK_NUM); } AscendCUtils::ResetMask(); } #endif dstOffset += ((lenBurst + dstStride) * ONE_BLK_SIZE) / halfSize; srcOffset += ((lenBurst + srcStride) * ONE_BLK_SIZE) / halfSize; } } __aicore__ inline void DataCopyWithAtomic(__gm__ half* dst, __ubuf__ half* src, const DataCopyParams& intriParams) { if (!g_isAtomic) { return; } DataCopyWithAtomicCom(dst, src, intriParams); } __aicore__ inline void DataCopyWithAtomic(__gm__ float* dst, __ubuf__ float* src, const DataCopyParams& intriParams) { if (!g_isAtomic) { return; } DataCopyWithAtomicCom(dst, src, intriParams); } #endif // __CCE_KT_TEST__ // BF16 #if __CCE_AICORE__ >= 220 && __CCE_AICORE__ != 310 constexpr uint32_t BF16_TO_FP32_MAN_LEN = 16; __aicore__ inline bfloat16_t ToBfloat16(const float& fVal) { float fNum = fVal; uint16_t uiNum = (*(reinterpret_cast(&fNum))) >> BF16_TO_FP32_MAN_LEN; bfloat16_t bNum = *(reinterpret_cast(&uiNum)); return bNum; } __aicore__ inline float ToFloat(const bfloat16_t& bVal) { bfloat16_t bNum = bVal; uint32_t uiNum = (*(reinterpret_cast(&bNum))) << BF16_TO_FP32_MAN_LEN; float fNum = *(reinterpret_cast(&uiNum)); return fNum; } #endif } // namespace AscendC #endif // ASCENDC_MODULE_UTILS_H