/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

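/*!
 * \file tiling.h
 * \brief Matmul configuration types: format/layout/batch enums, the MatmulConfig aggregate,
 *        its constexpr factory functions and the preset configurations built from them.
 */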
#ifndef _MATMUL_TILING_H_
#define _MATMUL_TILING_H_
#include "kernel_tiling/kernel_tiling.h"

// DEBUG_CODE(...) emits its argument tokens unchanged when ASCENDC_CPU_DEBUG evaluates to non-zero
// and expands to nothing otherwise (including when ASCENDC_CPU_DEBUG is not defined at all).
// The macro is variadic so that arguments containing top-level commas, such as
// DEBUG_CODE(Foo<A, B>();), are accepted; single-argument uses expand exactly as before.
#if ASCENDC_CPU_DEBUG
#define DEBUG_CODE(...) __VA_ARGS__
#else
#define DEBUG_CODE(...)
#endif

#define ITERATE_SIZE 2

// Format tags, numbered consecutively from 0.
// NOTE(review): presumably these name matrix/tensor data layouts - confirm against the users of this enum.
enum class CubeFormat {
    ND       = 0,
    NZ       = 1,
    ZN       = 2,
    ZZ       = 3,
    NN       = 4,
    ND_ALIGN = 5,
    SCALAR   = 6,
    VECTOR   = 7,
};

// Layout selector, numbered consecutively from 0; NONE is the zero value.
// NOTE(review): the letter groups (B/S/N/G/D, S1/S2) look like axis orderings - confirm with the callers.
enum class LayoutMode {
    NONE    = 0,
    BSNGD   = 1,
    SBNGD   = 2,
    BNGS1S2 = 3,
    NORMAL  = 4
};

// Batch mode stored in MatmulConfig::batchMode; NONE is the zero value.
// NOTE(review): the names suggest a comparison of batch/single-matrix size against L1 capacity - confirm.
enum class BatchMode {
    NONE                 = 0,
    BATCH_LESS_THAN_L1   = 1,
    BATCH_LARGE_THAN_L1  = 2,
    SINGLE_LARGE_THAN_L1 = 3
};

// Matmul implementation variants, numbered consecutively from 0.
// NOTE(review): the enumerators appear to parallel the doNorm / doMultiDataLoad / doBasicBlock /
// doIBShareNorm flags of MatmulConfig - confirm where the mapping is performed.
enum class MatmulVersion {
    NORMAL          = 0,
    MULTI_DATA_LOAD = 1,
    BASIC_BLOCK     = 2,
    IBSHARE_NORM    = 3,
};

// Bit flags with an 8-bit underlying type. Each named mode occupies its own bit (bits 0..3);
// ITERATE_MODE_DEFAULT has all eight bits set. MatmulConfig::iterateMode stores one of these values.
enum IterateMode : uint8_t {
    ITERATE_MODE_NORMAL  = 0x01,
    ITERATE_MODE_ALL     = 0x02,
    ITERATE_MODE_BATCH   = 0x04,
    ITERATE_MODE_N_BATCH = 0x08,
    ITERATE_MODE_DEFAULT = 0xFF,
};

/*!
 * \brief Aggregate of switches and sizes filled in by the Get*Config() factories in this file.
 *
 * Member order is part of the interface: the factories use designated initializers, which have to
 * follow declaration order, so members must not be reordered.
 *
 * Every member carries a default member initializer. The defaults equal the values that
 * value-initialization (`MatmulConfig cfg{};`) already produced, so a plain default-initialized
 * object (`MatmulConfig cfg;`) no longer contains indeterminate members.
 */
struct MatmulConfig {
    // Mode flags: GetNormalConfig() sets doNorm, GetBasicConfig() sets doBasicBlock,
    // GetMDLConfig() sets doMultiDataLoad.
    bool doNorm = false;
    bool doBasicBlock = false;
    bool doMultiDataLoad = false;
    // basic MNK could only be valid in basic block mode
    uint32_t basicM = 0;
    uint32_t basicN = 0;
    uint32_t basicK = 0;
    bool intrinsicsCheck = false;
    bool isNBatch = false;
    bool enVecND2NZ = false;
    // only be valid in special basic block mode
    bool doSpecialBasicBlock = false;
    uint32_t doMTE2Preload = 0;
    uint32_t singleCoreM = 0;
    uint32_t singleCoreN = 0;
    uint32_t singleCoreK = 0;
    uint32_t stepM = 0;
    uint32_t stepN = 0;
    // GetSpecialBasicConfig() stores basicM * basicN here.
    uint32_t baseMN = 0;
    // GetSpecialBasicConfig() stores singleCoreM * singleCoreN here.
    uint32_t singleCoreMN = 0;
    bool enUnitFlag = true;
    // AntiQuant Param
    bool isPerTensor = false;
    bool hasAntiQuantOffset = false;
    // Set by GetIBShareNormConfig().
    bool doIBShareNorm = false;
    // MDL support stepN == 2
    bool doSpecialMDL = false;
    bool enableInit = true;
    BatchMode batchMode = BatchMode::NONE;

    // Add for process performance
    bool enableEnd = true;
    bool enableGetTensorC = true;
    bool enableSetOrgShape = true;
    bool enableSetBias = true;
    bool enableSetTail = true;
    bool enableQuantVector = true;
    bool enableSetDefineData = true;
    // One of the IterateMode bit-flag values.
    uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT;
};

/*!
 * \brief Build the MatmulConfig in which doNorm is the only mode flag set.
 * \param intrinsicsLimit stored in MatmulConfig::intrinsicsCheck
 * \param batchLoop       stored in MatmulConfig::isNBatch
 * \param isVecND2NZ      stored in MatmulConfig::enVecND2NZ
 * \param bmmMode         stored in MatmulConfig::batchMode
 * \return configuration with enUnitFlag and all enable* switches on, all sizes zero
 */
__aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false,
    const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1)
{
    MatmulConfig cfg{};

    // Mode selection: normal template only.
    cfg.doNorm = true;
    cfg.doBasicBlock = false;
    cfg.doMultiDataLoad = false;
    cfg.doSpecialBasicBlock = false;
    cfg.doIBShareNorm = false;
    cfg.doSpecialMDL = false;

    // Caller-controlled options.
    cfg.intrinsicsCheck = intrinsicsLimit;
    cfg.isNBatch = batchLoop;
    cfg.enVecND2NZ = isVecND2NZ;
    cfg.batchMode = bmmMode;

    // No fixed shapes and no preload in this mode.
    cfg.basicM = 0;
    cfg.basicN = 0;
    cfg.basicK = 0;
    cfg.doMTE2Preload = 0;
    cfg.singleCoreM = 0;
    cfg.singleCoreN = 0;
    cfg.singleCoreK = 0;
    cfg.stepM = 0;
    cfg.stepN = 0;
    cfg.baseMN = 0;
    cfg.singleCoreMN = 0;

    // Unit flag on, anti-quant options off.
    cfg.enUnitFlag = true;
    cfg.isPerTensor = false;
    cfg.hasAntiQuantOffset = false;

    // Every optional interface stays enabled.
    cfg.enableInit = true;
    cfg.enableEnd = true;
    cfg.enableGetTensorC = true;
    cfg.enableSetOrgShape = true;
    cfg.enableSetBias = true;
    cfg.enableSetTail = true;
    cfg.enableQuantVector = true;
    cfg.enableSetDefineData = true;
    cfg.iterateMode = IterateMode::ITERATE_MODE_DEFAULT;

    return cfg;
}

/*!
 * \brief Build the MatmulConfig in which doMultiDataLoad is the only mode flag set.
 * \param intrinsicsLimit    stored in MatmulConfig::intrinsicsCheck
 * \param batchLoop          stored in MatmulConfig::isNBatch
 * \param doMTE2Preload      stored in MatmulConfig::doMTE2Preload
 * \param isVecND2NZ         stored in MatmulConfig::enVecND2NZ
 * \param isPerTensor        stored in MatmulConfig::isPerTensor
 * \param hasAntiQuantOffset stored in MatmulConfig::hasAntiQuantOffset
 * \param enUnitFlag         stored in MatmulConfig::enUnitFlag (defaults to false here)
 * \return configuration with batchMode fixed to BatchMode::NONE and all enable* switches on
 */
__aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false,
    const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false,
    bool hasAntiQuantOffset = false, const bool enUnitFlag = false)
{
    MatmulConfig cfg{};

    // Mode selection: multi-data-load template only.
    cfg.doNorm = false;
    cfg.doBasicBlock = false;
    cfg.doMultiDataLoad = true;
    cfg.doSpecialBasicBlock = false;
    cfg.doIBShareNorm = false;
    cfg.doSpecialMDL = false;

    // Caller-controlled options.
    cfg.intrinsicsCheck = intrinsicsLimit;
    cfg.isNBatch = batchLoop;
    cfg.enVecND2NZ = isVecND2NZ;
    cfg.doMTE2Preload = doMTE2Preload;
    cfg.enUnitFlag = enUnitFlag;
    cfg.isPerTensor = isPerTensor;
    cfg.hasAntiQuantOffset = hasAntiQuantOffset;

    // Not configurable through this factory.
    cfg.batchMode = BatchMode::NONE;

    // No fixed shapes in this mode.
    cfg.basicM = 0;
    cfg.basicN = 0;
    cfg.basicK = 0;
    cfg.singleCoreM = 0;
    cfg.singleCoreN = 0;
    cfg.singleCoreK = 0;
    cfg.stepM = 0;
    cfg.stepN = 0;
    cfg.baseMN = 0;
    cfg.singleCoreMN = 0;

    // Every optional interface stays enabled.
    cfg.enableInit = true;
    cfg.enableEnd = true;
    cfg.enableGetTensorC = true;
    cfg.enableSetOrgShape = true;
    cfg.enableSetBias = true;
    cfg.enableSetTail = true;
    cfg.enableQuantVector = true;
    cfg.enableSetDefineData = true;
    cfg.iterateMode = IterateMode::ITERATE_MODE_DEFAULT;

    return cfg;
}

/*!
 * \brief Build the MatmulConfig in which doSpecialMDL is the only mode flag set.
 * \param intrinsicsLimit    stored in MatmulConfig::intrinsicsCheck
 * \param batchLoop          stored in MatmulConfig::isNBatch
 * \param doMTE2Preload      stored in MatmulConfig::doMTE2Preload
 * \param isVecND2NZ         stored in MatmulConfig::enVecND2NZ
 * \param isPerTensor        stored in MatmulConfig::isPerTensor
 * \param hasAntiQuantOffset stored in MatmulConfig::hasAntiQuantOffset
 * \return configuration with enUnitFlag off, batchMode fixed to BatchMode::NONE and all enable* switches on
 */
__aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false,
    const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false,
    bool hasAntiQuantOffset = false)
{
    MatmulConfig cfg{};

    // Mode selection: special MDL only (doMultiDataLoad itself stays off).
    cfg.doNorm = false;
    cfg.doBasicBlock = false;
    cfg.doMultiDataLoad = false;
    cfg.doSpecialBasicBlock = false;
    cfg.doIBShareNorm = false;
    cfg.doSpecialMDL = true;

    // Caller-controlled options.
    cfg.intrinsicsCheck = intrinsicsLimit;
    cfg.isNBatch = batchLoop;
    cfg.enVecND2NZ = isVecND2NZ;
    cfg.doMTE2Preload = doMTE2Preload;
    cfg.isPerTensor = isPerTensor;
    cfg.hasAntiQuantOffset = hasAntiQuantOffset;

    // Not configurable through this factory.
    cfg.enUnitFlag = false;
    cfg.batchMode = BatchMode::NONE;

    // No fixed shapes in this mode.
    cfg.basicM = 0;
    cfg.basicN = 0;
    cfg.basicK = 0;
    cfg.singleCoreM = 0;
    cfg.singleCoreN = 0;
    cfg.singleCoreK = 0;
    cfg.stepM = 0;
    cfg.stepN = 0;
    cfg.baseMN = 0;
    cfg.singleCoreMN = 0;

    // Every optional interface stays enabled.
    cfg.enableInit = true;
    cfg.enableEnd = true;
    cfg.enableGetTensorC = true;
    cfg.enableSetOrgShape = true;
    cfg.enableSetBias = true;
    cfg.enableSetTail = true;
    cfg.enableQuantVector = true;
    cfg.enableSetDefineData = true;
    cfg.iterateMode = IterateMode::ITERATE_MODE_DEFAULT;

    return cfg;
}

/*!
 * \brief Build the MatmulConfig in which doBasicBlock is the only mode flag set.
 * \param basicM          stored in MatmulConfig::basicM
 * \param basicN          stored in MatmulConfig::basicN
 * \param basicK          stored in MatmulConfig::basicK
 * \param intrinsicsLimit stored in MatmulConfig::intrinsicsCheck
 * \param batchLoop       stored in MatmulConfig::isNBatch
 * \param bmmMode         stored in MatmulConfig::batchMode
 * \return configuration with enVecND2NZ and enUnitFlag off and all enable* switches on
 */
__aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const uint32_t basicN,
    const uint32_t basicK, const bool intrinsicsLimit = false, const bool batchLoop = false,
    const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1)
{
    MatmulConfig cfg{};

    // Mode selection: basic-block template only.
    cfg.doNorm = false;
    cfg.doBasicBlock = true;
    cfg.doMultiDataLoad = false;
    cfg.doSpecialBasicBlock = false;
    cfg.doIBShareNorm = false;
    cfg.doSpecialMDL = false;

    // Basic block shape supplied by the caller.
    cfg.basicM = basicM;
    cfg.basicN = basicN;
    cfg.basicK = basicK;

    // Remaining caller-controlled options.
    cfg.intrinsicsCheck = intrinsicsLimit;
    cfg.isNBatch = batchLoop;
    cfg.batchMode = bmmMode;

    // Not configurable through this factory.
    cfg.enVecND2NZ = false;
    cfg.doMTE2Preload = 0;
    cfg.enUnitFlag = false;
    cfg.isPerTensor = false;
    cfg.hasAntiQuantOffset = false;

    // Single-core shape and step fields are left at zero.
    cfg.singleCoreM = 0;
    cfg.singleCoreN = 0;
    cfg.singleCoreK = 0;
    cfg.stepM = 0;
    cfg.stepN = 0;
    cfg.baseMN = 0;
    cfg.singleCoreMN = 0;

    // Every optional interface stays enabled.
    cfg.enableInit = true;
    cfg.enableEnd = true;
    cfg.enableGetTensorC = true;
    cfg.enableSetOrgShape = true;
    cfg.enableSetBias = true;
    cfg.enableSetTail = true;
    cfg.enableQuantVector = true;
    cfg.enableSetDefineData = true;
    cfg.iterateMode = IterateMode::ITERATE_MODE_DEFAULT;

    return cfg;
}

/*!
 * \brief Build the MatmulConfig in which doSpecialBasicBlock is the only mode flag set.
 * \param basicM          stored in MatmulConfig::basicM
 * \param basicN          stored in MatmulConfig::basicN
 * \param basicK          stored in MatmulConfig::basicK
 * \param singleCoreM     stored in MatmulConfig::singleCoreM
 * \param singleCoreN     stored in MatmulConfig::singleCoreN
 * \param singleCoreK     stored in MatmulConfig::singleCoreK
 * \param stepM           stored in MatmulConfig::stepM
 * \param stepN           stored in MatmulConfig::stepN
 * \param intrinsicsLimit stored in MatmulConfig::intrinsicsCheck
 * \param batchLoop       stored in MatmulConfig::isNBatch
 * \param bmmMode         stored in MatmulConfig::batchMode
 * \return configuration that additionally carries the products basicM * basicN (baseMN) and
 *         singleCoreM * singleCoreN (singleCoreMN), both computed in uint32_t arithmetic
 */
__aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, const uint32_t basicN,
    const uint32_t basicK, const uint32_t singleCoreM, const uint32_t singleCoreN, const uint32_t singleCoreK,
    const uint32_t stepM, const uint32_t stepN, const bool intrinsicsLimit = false, const bool batchLoop = false,
    const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1)
{
    MatmulConfig cfg{};

    // Mode selection: special basic-block template only (doBasicBlock itself stays off).
    cfg.doNorm = false;
    cfg.doBasicBlock = false;
    cfg.doMultiDataLoad = false;
    cfg.doSpecialBasicBlock = true;
    cfg.doIBShareNorm = false;
    cfg.doSpecialMDL = false;

    // Shapes and steps supplied by the caller.
    cfg.basicM = basicM;
    cfg.basicN = basicN;
    cfg.basicK = basicK;
    cfg.singleCoreM = singleCoreM;
    cfg.singleCoreN = singleCoreN;
    cfg.singleCoreK = singleCoreK;
    cfg.stepM = stepM;
    cfg.stepN = stepN;

    // Derived products, precomputed once here.
    cfg.baseMN = basicM * basicN;
    cfg.singleCoreMN = singleCoreM * singleCoreN;

    // Remaining caller-controlled options.
    cfg.intrinsicsCheck = intrinsicsLimit;
    cfg.isNBatch = batchLoop;
    cfg.batchMode = bmmMode;

    // Not configurable through this factory.
    cfg.enVecND2NZ = false;
    cfg.doMTE2Preload = 0;
    cfg.enUnitFlag = false;
    cfg.isPerTensor = false;
    cfg.hasAntiQuantOffset = false;

    // Every optional interface stays enabled.
    cfg.enableInit = true;
    cfg.enableEnd = true;
    cfg.enableGetTensorC = true;
    cfg.enableSetOrgShape = true;
    cfg.enableSetBias = true;
    cfg.enableSetTail = true;
    cfg.enableQuantVector = true;
    cfg.enableSetDefineData = true;
    cfg.iterateMode = IterateMode::ITERATE_MODE_DEFAULT;

    return cfg;
}

/*!
 * \brief Build the MatmulConfig in which doIBShareNorm is the only mode flag set.
 * \param intrinsicsLimit stored in MatmulConfig::intrinsicsCheck
 * \param batchLoop       stored in MatmulConfig::isNBatch
 * \param isVecND2NZ      stored in MatmulConfig::enVecND2NZ
 * \param bmmMode         stored in MatmulConfig::batchMode
 * \return configuration with enUnitFlag and all enable* switches on, all sizes zero
 */
__aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimit = false, const bool batchLoop = false,
    const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1)
{
    return {
        .doNorm = false,
        .doBasicBlock = false,
        .doMultiDataLoad = false,
        .basicM = 0,
        .basicN = 0,
        .basicK = 0,
        .intrinsicsCheck = intrinsicsLimit,
        .isNBatch = batchLoop,
        .enVecND2NZ = isVecND2NZ,
        .doSpecialBasicBlock = false,
        // doMTE2Preload is a uint32_t, not a flag: use 0 like the other factories do.
        .doMTE2Preload = 0,
        .singleCoreM = 0,
        .singleCoreN = 0,
        .singleCoreK = 0,
        .stepM = 0,
        .stepN = 0,
        .baseMN = 0,
        .singleCoreMN = 0,
        .enUnitFlag = true,
        .isPerTensor = false,
        .hasAntiQuantOffset = false,
        .doIBShareNorm = true,
        .doSpecialMDL = false,
        .enableInit = true,
        .batchMode = bmmMode,
        .enableEnd = true,
        .enableGetTensorC = true,
        .enableSetOrgShape = true,
        .enableSetBias = true,
        .enableSetTail = true,
        .enableQuantVector = true,
        .enableSetDefineData = true,
        .iterateMode = IterateMode::ITERATE_MODE_DEFAULT
    };
}

// Ready-made presets: each one is the result of the corresponding factory called with its
// default arguments, except MM_CFG_BB, which passes basicM = basicN = basicK = 128.
constexpr MatmulConfig CFG_NORM = GetNormalConfig();
constexpr MatmulConfig CFG_MDL = GetMDLConfig();
constexpr MatmulConfig MM_CFG_BB = GetBasicConfig(128, 128, 128);
constexpr MatmulConfig CFG_IBSHARE_NORM = GetIBShareNormConfig();

// Iteration order selector, numbered consecutively from 0; UNDEF is the last value.
// NOTE(review): ORDER_M / ORDER_N presumably name the axis that is traversed first - confirm with the callers.
enum class IterateOrder {
    ORDER_M = 0,
    ORDER_N = 1,
    UNDEF   = 2,
};

// Plain aggregate of five int32_t fields, in this order: offset, row, col, height, width.
// NOTE(review): presumably describes a sub-matrix by position and size - confirm with the callers.
struct MatrixOffset {
    int32_t offset;
    int32_t row;
    int32_t col;
    int32_t height;
    int32_t width;
};

// Declaration only; the definition is not part of this header.
// NOTE(review): presumably holds the index of the block being simulated/executed - confirm at the definition.
extern int blockidx_;

#endif // _MATMUL_TILING_H_