/** * Copyright (c) 2024 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ /*! * \file tiling.h * \brief */ #ifndef _MATMUL_TILING_H_ #define _MATMUL_TILING_H_ #include "kernel_tiling/kernel_tiling.h" #if ASCENDC_CPU_DEBUG #define DEBUG_CODE(T) T #else #define DEBUG_CODE(T) #endif #define ITERATE_SIZE 2 enum class CubeFormat { ND = 0, NZ, ZN, ZZ, NN, ND_ALIGN, SCALAR, VECTOR, }; enum class LayoutMode { NONE = 0, BSNGD, SBNGD, BNGS1S2, NORMAL }; enum class BatchMode { NONE = 0, BATCH_LESS_THAN_L1, BATCH_LARGE_THAN_L1, SINGLE_LARGE_THAN_L1 }; enum class MatmulVersion { NORMAL = 0, MULTI_DATA_LOAD, BASIC_BLOCK, IBSHARE_NORM, }; enum IterateMode : uint8_t { ITERATE_MODE_NORMAL = 0b00000001, ITERATE_MODE_ALL = 0b00000010, ITERATE_MODE_BATCH = 0b00000100, ITERATE_MODE_N_BATCH = 0b00001000, ITERATE_MODE_DEFAULT = 0b11111111, }; struct MatmulConfig { bool doNorm; bool doBasicBlock; bool doMultiDataLoad; // basic MNK could only be valid in basic block mode uint32_t basicM; uint32_t basicN; uint32_t basicK; bool intrinsicsCheck; bool isNBatch; bool enVecND2NZ; // only be valid in special basic block mode bool doSpecialBasicBlock; uint32_t doMTE2Preload; uint32_t singleCoreM; uint32_t singleCoreN; uint32_t singleCoreK; uint32_t stepM; uint32_t stepN; uint32_t baseMN; uint32_t singleCoreMN; bool enUnitFlag = true; // AntiQuant Param bool isPerTensor; bool hasAntiQuantOffset; bool doIBShareNorm; // MDL support stepN == 2 bool doSpecialMDL; bool enableInit = true; BatchMode batchMode; // Add for process performance bool enableEnd = true; bool enableGetTensorC = true; bool enableSetOrgShape = true; bool enableSetBias = true; bool enableSetTail = true; bool enableQuantVector = true; bool enableSetDefineData = true; uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT; }; __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1) { return { .doNorm = true, .doBasicBlock = false, .doMultiDataLoad = false, .basicM = 0, .basicN = 0, .basicK = 0, .intrinsicsCheck = intrinsicsLimit, .isNBatch = batchLoop, .enVecND2NZ = isVecND2NZ, .doSpecialBasicBlock = false, .doMTE2Preload = 0, .singleCoreM = 0, .singleCoreN = 0, .singleCoreK = 0, .stepM = 0, .stepN = 0, .baseMN = 0, .singleCoreMN = 0, .enUnitFlag = true, .isPerTensor = false, .hasAntiQuantOffset = false, .doIBShareNorm = false, .doSpecialMDL = false, .enableInit = true, .batchMode = bmmMode, .enableEnd = true, .enableGetTensorC = true, .enableSetOrgShape = true, .enableSetBias = true, .enableSetTail = true, .enableQuantVector = true, .enableSetDefineData = true, .iterateMode = IterateMode::ITERATE_MODE_DEFAULT }; } __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false, bool hasAntiQuantOffset = false, const bool enUnitFlag = false) { return { .doNorm = false, .doBasicBlock = false, .doMultiDataLoad = true, .basicM = 0, .basicN = 0, .basicK = 0, .intrinsicsCheck = intrinsicsLimit, .isNBatch = batchLoop, .enVecND2NZ = isVecND2NZ, .doSpecialBasicBlock = false, .doMTE2Preload = doMTE2Preload, .singleCoreM = 0, .singleCoreN = 0, .singleCoreK = 0, .stepM = 0, .stepN = 0, .baseMN = 0, .singleCoreMN = 0, .enUnitFlag = enUnitFlag, .isPerTensor = isPerTensor, .hasAntiQuantOffset = hasAntiQuantOffset, .doIBShareNorm = false, .doSpecialMDL = false, .enableInit = true, .batchMode = BatchMode::NONE, .enableEnd = true, .enableGetTensorC = true, .enableSetOrgShape = true, .enableSetBias = true, .enableSetTail = true, .enableQuantVector = true, .enableSetDefineData = true, .iterateMode = IterateMode::ITERATE_MODE_DEFAULT }; } __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false, bool hasAntiQuantOffset = false) { return { .doNorm = false, .doBasicBlock = false, .doMultiDataLoad = false, .basicM = 0, .basicN = 0, .basicK = 0, .intrinsicsCheck = intrinsicsLimit, .isNBatch = batchLoop, .enVecND2NZ = isVecND2NZ, .doSpecialBasicBlock = false, .doMTE2Preload = doMTE2Preload, .singleCoreM = 0, .singleCoreN = 0, .singleCoreK = 0, .stepM = 0, .stepN = 0, .baseMN = 0, .singleCoreMN = 0, .enUnitFlag = false, .isPerTensor = isPerTensor, .hasAntiQuantOffset = hasAntiQuantOffset, .doIBShareNorm = false, .doSpecialMDL = true, .enableInit = true, .batchMode = BatchMode::NONE, .enableEnd = true, .enableGetTensorC = true, .enableSetOrgShape = true, .enableSetBias = true, .enableSetTail = true, .enableQuantVector = true, .enableSetDefineData = true, .iterateMode = IterateMode::ITERATE_MODE_DEFAULT }; } __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const uint32_t basicN, const uint32_t basicK, const bool intrinsicsLimit = false, const bool batchLoop = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1) { return { .doNorm = false, .doBasicBlock = true, .doMultiDataLoad = false, .basicM = basicM, .basicN = basicN, .basicK = basicK, .intrinsicsCheck = intrinsicsLimit, .isNBatch = batchLoop, .enVecND2NZ = false, .doSpecialBasicBlock = false, .doMTE2Preload = 0, .singleCoreM = 0, .singleCoreN = 0, .singleCoreK = 0, .stepM = 0, .stepN = 0, .baseMN = 0, .singleCoreMN = 0, .enUnitFlag = false, .isPerTensor = false, .hasAntiQuantOffset = false, .doIBShareNorm = false, .doSpecialMDL = false, .enableInit = true, .batchMode = bmmMode, .enableEnd = true, .enableGetTensorC = true, .enableSetOrgShape = true, .enableSetBias = true, .enableSetTail = true, .enableQuantVector = true, .enableSetDefineData = true, .iterateMode = IterateMode::ITERATE_MODE_DEFAULT }; } __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, const uint32_t basicN, const uint32_t basicK, const uint32_t singleCoreM, const uint32_t singleCoreN, const uint32_t singleCoreK, const uint32_t stepM, const uint32_t stepN, const bool intrinsicsLimit = false, const bool batchLoop = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1) { return { .doNorm = false, .doBasicBlock = false, .doMultiDataLoad = false, .basicM = basicM, .basicN = basicN, .basicK = basicK, .intrinsicsCheck = intrinsicsLimit, .isNBatch = batchLoop, .enVecND2NZ = false, .doSpecialBasicBlock = true, .doMTE2Preload = 0, .singleCoreM = singleCoreM, .singleCoreN = singleCoreN, .singleCoreK = singleCoreK, .stepM = stepM, .stepN = stepN, .baseMN = basicM * basicN, .singleCoreMN = singleCoreM * singleCoreN, .enUnitFlag = false, .isPerTensor = false, .hasAntiQuantOffset = false, .doIBShareNorm = false, .doSpecialMDL = false, .enableInit = true, .batchMode = bmmMode, .enableEnd = true, .enableGetTensorC = true, .enableSetOrgShape = true, .enableSetBias = true, .enableSetTail = true, .enableQuantVector = true, .enableSetDefineData = true, .iterateMode = IterateMode::ITERATE_MODE_DEFAULT }; } __aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1) { return { .doNorm = false, .doBasicBlock = false, .doMultiDataLoad = false, .basicM = 0, .basicN = 0, .basicK = 0, .intrinsicsCheck = intrinsicsLimit, .isNBatch = batchLoop, .enVecND2NZ = isVecND2NZ, .doSpecialBasicBlock = false, .doMTE2Preload = false, .singleCoreM = 0, .singleCoreN = 0, .singleCoreK = 0, .stepM = 0, .stepN = 0, .baseMN = 0, .singleCoreMN = 0, .enUnitFlag = true, .isPerTensor = false, .hasAntiQuantOffset = false, .doIBShareNorm = true, .doSpecialMDL = false, .enableInit = true, .batchMode = bmmMode, .enableEnd = true, .enableGetTensorC = true, .enableSetOrgShape = true, .enableSetBias = true, .enableSetTail = true, .enableQuantVector = true, .enableSetDefineData = true, .iterateMode = IterateMode::ITERATE_MODE_DEFAULT }; } constexpr MatmulConfig CFG_NORM = GetNormalConfig(); constexpr MatmulConfig CFG_MDL = GetMDLConfig(); constexpr MatmulConfig MM_CFG_BB = GetBasicConfig(128, 128, 128); constexpr MatmulConfig CFG_IBSHARE_NORM = GetIBShareNormConfig(); enum class IterateOrder { ORDER_M = 0, ORDER_N, UNDEF, }; struct MatrixOffset { int32_t offset; int32_t row, col; int32_t height, width; }; extern int blockidx_; #endif // _MATMUL_TILING_H_