"""Sample driver for the ``ascendebug`` operator debugging tool.

Four scenarios are demonstrated, each in its own entry point:

* ``built_in``       - a built-in operator (FlashAttentionScore)
* ``x1_kernel_run``  - direct kernel invocation (``add_custom``)
* ``msopgen``        - a standard custom-operator project (AddCustom)
* ``op_contrib_run`` - an inner-source (op contrib) framework operator

Every path below is a placeholder and must be edited before running.
"""
import os
import shutil
import sys

import torch
import numpy as np

import ascendebug

# ------------ Common environment configuration: edit for your setup ------------ #
CANN_INSTALL_PATH = "/usr/local/Ascend/"
ESL_PATH = '/xxx_esl_path/'
# Required for the standard custom-operator scenario.
CUSTOMIZE_PATH = os.path.join(CANN_INSTALL_PATH, "latest/opp/vendors/customize")
# Required for the built-in scenario.
DATA_PATH = '/xxx_built_in_data_path/'
REPO_PATH = "/xxx_repo_path_path"
# Required for the inner-source (op contrib) framework scenario.
CONTRIB_PATH = "/xxx_op_contrib_code_path"
# Required for the direct kernel invocation scenario.
KERNEL_SOURCE_FILE = "/xxx/add_custom.cpp"
KERNEL_ENTRY_FUNC_NAME = 'add_custom'
SOURCE_INCLUDE_PATH = []

# Separator line printed around every result banner.
_BANNER = "*" * 200


def _print_banner(*values):
    """Print ``values`` on one line, framed above and below by a star line."""
    print(_BANNER)
    print(*values)
    print(_BANNER)


def _print_stage(message):
    """Print ``message`` between two 50-character ``=`` rules on one line."""
    rule = '=' * 50
    print(rule, message, rule)


def _npu_compile_info(extern, **extra):
    """Build an ``ascendebug.NpuCompileInfo`` from a compile step's ``extern`` dict.

    ``extern`` is the third value returned by the ``compile_*_npu`` calls; its
    ``'cross_core_sync'`` and ``'task_ration'`` entries are forwarded as
    ``syncall`` and ``task_ration``. Extra keyword arguments (for example
    ``dump_mode``) are passed through unchanged.
    """
    return ascendebug.NpuCompileInfo(
        syncall=extern['cross_core_sync'],
        task_ration=extern['task_ration'],
        **extra,
    )


def _create_built_in_debug_op():
    """Describe the FlashAttentionScore operator used by ``built_in``.

    Input and golden-output data files are looked up under ``DATA_PATH``.
    Returns the debug-op object produced by the ``ascendebug`` builder chain.
    """
    debug_op = (
        ascendebug.create_debug_op('FlashAttentionScore', 'MixCore', 'Ascend910B1')
        .custom_input('query', 'float16', [24, 144, 1280], os.path.join(DATA_PATH, 'q.bin'))
        .custom_input('key', 'float16', [24, 144, 1280], os.path.join(DATA_PATH, 'k.bin'))
        .custom_input('value', 'float16', [24, 144, 1280], os.path.join(DATA_PATH, 'v.bin'))
        .custom_input('real_shift', 'float16', None, None, ['optional'])
        .custom_input('drop_mask', 'uint8', [1244160],
                      os.path.join(DATA_PATH, 'drop_mask.bin'), ['optional'])
        .custom_input('padding_mask', 'float16', None, None, ['optional'])
        .custom_input('atten_mask', 'bool', None, None, ['optional'])
        .custom_input('prefix', 'int64', None, None, ['optional'])
        .custom_input('actual_seq_qlen', 'int64', None, None, ['optional'])
        .custom_input('actual_seq_kvlen', 'int64', None, None, ['optional'])
        .custom_output('softmax_max', 'float32', [24, 20, 144, 8], None)
        .custom_output('softmax_sum', 'float32', [24, 20, 144, 8], None)
        .custom_output('softmax_out', 'float16', [24, 20, 144, 144], None)
        .custom_output('attention_out', 'float16', [24, 20, 144, 64],
                       os.path.join(DATA_PATH, 'attention_out.bin'))
        .attr('scale_value', 'float', 1.0)
        .attr('keep_prob', 'float', 0.8)
        # 'pre_tockens' / 'next_tockens' are runtime attribute names; keep the spelling.
        .attr('pre_tockens', 'int', 2147483647)
        .attr('next_tockens', 'int', 2147483647)
        .attr('head_num', 'int', 20)
        .attr('input_layout', 'string', 'BSH')
        .attr('inner_precise', 'int', 0)
    )
    return debug_op


def built_in():
    """Run the full debug flow for a built-in operator.

    Steps: tiling, CPU run, NPU compile (twice, with different options),
    NPU run, camodel simulation, ESL simulation and profiling. Uses
    ``REPO_PATH``, ``CANN_INSTALL_PATH`` and ``ESL_PATH``.
    """
    # Set the log file and clear any previous content.
    ascendebug.set_log_file('built_in.log', clean=True)

    # Step 1: build the operator description.
    debug_op = _create_built_in_debug_op()

    # Step 2: create the debug executor.
    op_executor = ascendebug.create_op_executor(debug_op=debug_op, install_path=CANN_INSTALL_PATH)

    # Step 3: tiling. Two alternatives are shown; the second result overwrites the first.
    # Option 1: compile the tiling .so first, then run tiling with it.
    tiling_so = op_executor.compile_builtin_tiling(REPO_PATH)
    tiling_info = op_executor.run_tiling(tiling_so)
    # Option 2: take the tiling .so straight from the CANN package and run tiling.
    tiling_info = op_executor.run_builtin_tiling()
    # Fields printed below:
    #   tiling_bin        binary tiling data
    #   tiling_workspace  tiling workspace
    #   block_num         block_num
    #   tiling_key        tiling_key
    print(_BANNER)
    print("=========================built-in tiling end=================================================")
    print(tiling_info.tiling_bin, tiling_info.tiling_workspace, tiling_info.block_num, tiling_info.tiling_key)
    print(_BANNER)

    # Step 4: CPU debugging.
    cpu_options = ascendebug.CpuOptions()
    op_executor.run_builtin_cpu(REPO_PATH, tiling_info, cpu_options)

    # Step 5: NPU compile (CCEC/OPC). First with npu_compile_type='ccec' ...
    compile_npu_options = ascendebug.CompileNpuOptions(syncall=True, npu_compile_type='ccec')
    name, kernel_file, extern = op_executor.compile_builtin_npu(
        REPO_PATH, tiling_info.tiling_key, compile_npu_options)
    _print_banner(name, kernel_file, extern)

    # ... then with the default compile type; these results feed the later steps.
    compile_npu_options = ascendebug.CompileNpuOptions(syncall=True)
    name, kernel_file, extern = op_executor.compile_builtin_npu(
        REPO_PATH, tiling_info.tiling_key, compile_npu_options)
    _print_banner(name, kernel_file, extern)

    # Step 6: run on the NPU.
    run_npu_options = ascendebug.RunNpuOptions()
    npu_compile_info = _npu_compile_info(extern)
    op_executor.run_npu(kernel_file, run_npu_options,
                        npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner("=========================npu=================================================")

    # Step 7: camodel simulation, using a kernel compiled with simulator=True.
    # `extern` is rebound here, but npu_compile_info from step 6 is what gets reused.
    compile_npu_options = ascendebug.CompileNpuOptions(syncall=True, simulator=True)
    name_simulator, kernel_file_simulator, extern = op_executor.compile_builtin_npu(
        REPO_PATH, tiling_info.tiling_key, compile_npu_options)
    run_simulator_options = ascendebug.RunSimuOptions(block_num=1)
    op_executor.run_camodel(kernel_file_simulator, run_simulator_options,
                            npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner("camodel end----")

    # Step 8: ESL simulation (x86 only).
    op_executor.run_esl(kernel_file_simulator, ESL_PATH, run_simulator_options,
                        npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner("esl end----")

    # Step 9: profiling.
    profiling_options = ascendebug.RunProfilingOptions(
        block_num=24, loop=10, profiling=["PipeUtilization"])
    op_executor.run_profiling(kernel_file, profiling_options,
                              npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner("profiling_option----")


def op_contrib_kernel(x, y):
    """Run the debug flow for an inner-source (op contrib) operator.

    ``x`` and ``y`` are accepted for symmetry with ``x1_kernel`` but are not
    used: the operator's data comes from the ``.bin`` paths given below.
    """
    # Set the log file and clear any previous content.
    ascendebug.set_log_file('op_contrib_kernel.log', clean=True)

    # Step 1: build the operator description.
    debug_op = (
        ascendebug.create_debug_op('ForeachSigmoid', 'VectorCore', 'Ascend910B1')
        .list_custom_input([('x', 'float32', [1, 4], 'xxx/x.bin', [])])
        .list_custom_output([('y', 'float32', [1, 4], 'xxx/y.bin', [])])
    )

    # Step 2: create the debug executor.
    op_executor = ascendebug.create_op_executor(debug_op=debug_op, install_path=CANN_INSTALL_PATH)

    # Tiling: compile the tiling .so, then run it.
    tiling_so = op_executor.compile_opcontrib_tiling(CONTRIB_PATH)
    tiling_info = op_executor.run_tiling(tiling_so)
    print(_BANNER)
    print(f"tiling_so----{tiling_so}")
    print(f"tiling_info----{tiling_info}")
    print(_BANNER)

    # CPU run.
    cpu_options = ascendebug.CpuOptions()
    op_executor.run_opcontrib_cpu(CONTRIB_PATH, tiling_info, cpu_options)
    _print_banner(f"run_opcontrib_cpu end ----{cpu_options}")

    # NPU compile.
    kernel_name, kernel_file, extern = op_executor.compile_opcontrib_npu(CONTRIB_PATH)
    print(_BANNER)
    print('compile npu end: op_contrib')
    print(kernel_name, kernel_file, extern)
    print(_BANNER)

    # NPU run.
    npu_compile_info = _npu_compile_info(extern)
    run_npu_options = ascendebug.RunNpuOptions()
    op_executor.run_npu(kernel_file, run_npu_options,
                        npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner('run npu end: op_contrib')

    # camodel simulation.
    # NOTE(review): unlike the other scenarios, no simulator=True compile is done
    # here; confirm the plain kernel_file is accepted by the simulators.
    run_simulator_options = ascendebug.RunSimuOptions(block_num=1, timeout=1200)
    op_executor.run_camodel(kernel_file, run_simulator_options,
                            npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner('run camodel end: op_contrib')

    # ESL simulation.
    op_executor.run_esl(kernel_file, ESL_PATH, run_simulator_options,
                        npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner('run esl end: op_contrib')

    # Profiling.
    profiling_options = ascendebug.RunProfilingOptions(block_num=24, loop=10)
    op_executor.run_profiling(kernel_file, profiling_options,
                              npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner('run profiling end: op_contrib')


def x1_kernel(x, y):
    """Run the debug flow for a directly invoked kernel (``add_custom``).

    Args:
        x: first input tensor; registered as input ``'x'``.
        y: second input tensor; registered as input ``'y'``.

    ``x + y`` is registered as output ``'z'``. The kernel source and entry
    point come from ``KERNEL_SOURCE_FILE`` / ``KERNEL_ENTRY_FUNC_NAME``.
    """
    z = x + y

    # Set the log file and clear any previous content.
    ascendebug.set_log_file('x1_kernel.log', clean=True)

    # Build the operator description.
    debug_op = (
        ascendebug.create_debug_op('add_custom', 'VectorCore', 'Ascend910B1')
        .scalar_input('tileNumIn', 'uint32', 10)
        .tensor_input('x', x)
        .tensor_input('y', y)
        .tensor_output('z', z)
    )

    # Create the debug executor.
    op_executor = ascendebug.create_op_executor(debug_op=debug_op, install_path=CANN_INSTALL_PATH)
    kernel_info = ascendebug.OpKernelInfo(
        KERNEL_SOURCE_FILE, KERNEL_ENTRY_FUNC_NAME, SOURCE_INCLUDE_PATH)

    # NPU compile and run.
    _print_stage('call kernel npu start')
    npu_option = ascendebug.CompileNpuOptions()
    kernel_name, kernel_file, extern = op_executor.compile_call_kernel_npu(kernel_info, npu_option)
    _print_banner(kernel_name, kernel_file, extern)
    run_npu_options = ascendebug.RunNpuOptions(block_num=32)
    npu_compile_info = _npu_compile_info(extern)
    op_executor.run_npu(kernel_file, npu_options=run_npu_options,
                        npu_compile_info=npu_compile_info)
    _print_stage('call kernel npu end')

    # Profiling.
    _print_stage('call kernel profiling start')
    profiling_options = ascendebug.RunProfilingOptions(block_num=32, loop=10)
    op_executor.run_profiling(kernel_file, profiling_options, npu_compile_info=npu_compile_info)
    _print_stage('call kernel profiling end')

    # camodel simulation, using a kernel compiled with simulator=True.
    _print_stage('call kernel camodel start')
    npu_option = ascendebug.CompileNpuOptions(simulator=True)
    _, kernel_file_simulator, _ = op_executor.compile_call_kernel_npu(kernel_info, npu_option)
    run_simulator_options = ascendebug.RunSimuOptions(block_num=1, timeout=1200)
    op_executor.run_camodel(kernel_file_simulator, run_simulator_options,
                            npu_compile_info=npu_compile_info)
    _print_stage('call kernel camodel end')

    # ESL simulation.
    # NOTE(review): built_in() passes the simulator-compiled kernel to run_esl,
    # this passes the plain NPU kernel_file - confirm which one is intended.
    op_executor.run_esl(kernel_file, ESL_PATH, run_simulator_options,
                        npu_compile_info=npu_compile_info)
    _print_stage('call kernel esl end')

    # Kernel printf: recompile and rerun with dump_mode='normal'.
    _print_stage('call kernel printf start')
    npu_option = ascendebug.CompileNpuOptions(dump_mode='normal')
    kernel_name, kernel_file, extern = op_executor.compile_call_kernel_npu(kernel_info, npu_option)
    run_npu_options = ascendebug.RunNpuOptions(block_num=32)
    npu_compile_info = _npu_compile_info(extern, dump_mode='normal')
    op_executor.run_npu(kernel_file, npu_options=run_npu_options,
                        npu_compile_info=npu_compile_info)
    _print_stage('call kernel printf end')

    # CPU run.
    _print_stage('call kernel cpu start')
    cpu_option = ascendebug.CpuOptions()
    op_executor.run_call_kernel_cpu(kernel_info, 32, cpu_option)
    _print_stage('call kernel cpu end')


def x1_kernel_run():
    """Generate random float16 inputs of shape (1, 16384) and run ``x1_kernel``."""
    # Step 1: create the input tensors (torch or numpy can be used to generate them).
    x = np.random.uniform(1, 100, (1, 16384)).astype(np.float16)
    y = np.random.uniform(1, 100, (1, 16384)).astype(np.float16)
    x1_kernel(x, y)


def op_contrib_run():
    """Generate random float16 inputs of length 16384 and run ``op_contrib_kernel``."""
    # Step 1: create the input tensors (torch or numpy can be used to generate them).
    x = np.random.uniform(1, 100, (16384)).astype(np.float16)
    y = np.random.uniform(1, 100, (16384)).astype(np.float16)
    op_contrib_kernel(x, y)


def msopgen():
    """Run the full debug flow for a standard custom-operator project.

    Steps: tiling, CPU run, NPU compile and run, NPU run with kernel printf
    (``dump_mode='normal'``), camodel simulation, ESL simulation, profiling.
    Uses ``CUSTOMIZE_PATH``, ``CANN_INSTALL_PATH`` and ``ESL_PATH``.
    """
    # Set the log file and clear any previous content. This scenario gets its
    # own file: with clean=True, reusing 'x1_kernel.log' would wipe the log
    # written by x1_kernel(), which runs earlier in the __main__ sequence.
    ascendebug.set_log_file('msopgen.log', clean=True)

    # Step 1: build the operator description.
    debug_op = (
        ascendebug.create_debug_op('AddCustom', 'VectorCore', 'Ascend910B1')
        .custom_input('x', 'int32', [32], 'xxx/x.bin')
        .custom_input('y', 'int32', [32], 'xxx/y.bin')
        .custom_output('z', 'int32', [32], 'xx/z.bin')
        .attr('mask', 'list_int', [0, 0])
        .attr('repeatTimes', 'int', 1)
        .attr('dstBlkStride', 'int', 1)
        .attr('src0BlkStride', 'int', 1)
        .attr('src1BlkStride', 'int', 1)
        .attr('dstRepStride', 'int', 8)
        .attr('src0RepStride', 'int', 8)
        .attr('src1RepStride', 'int', 8)
        .attr('calCount', 'int', 3)
        .attr('memory', 'int', 0)
    )

    # Step 2: create the debug executor.
    op_executor = ascendebug.create_op_executor(debug_op=debug_op, install_path=CANN_INSTALL_PATH)

    # Step 3: tiling.
    print("**********************************msopgen tiling**********************************")
    tiling_info = op_executor.run_custom_tiling(CUSTOMIZE_PATH)
    _print_banner(tiling_info.tiling_bin, tiling_info.tiling_workspace,
                  tiling_info.block_num, tiling_info.tiling_key)

    # Step 4: CPU debugging.
    cpu_options = ascendebug.CpuOptions()
    op_executor.run_custom_cpu(CUSTOMIZE_PATH, tiling_info, cpu_options)

    # Step 5: NPU compile.
    compile_npu_options = ascendebug.CompileNpuOptions(syncall=True)
    name, kernel_file, extern = op_executor.compile_custom_npu(
        CUSTOMIZE_PATH, tiling_info.tiling_key, compile_npu_options)
    _print_banner(name, kernel_file, extern)

    # Step 6: run on the NPU.
    run_npu_options = ascendebug.RunNpuOptions()
    npu_compile_info = _npu_compile_info(extern)
    op_executor.run_npu(kernel_file, run_npu_options,
                        npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner("=========================npu=================================================")

    # NPU printf: recompile and rerun with dump_mode='normal'. kernel_file and
    # npu_compile_info from this compile are the ones reused by the steps below.
    compile_npu_options = ascendebug.CompileNpuOptions(syncall=True, dump_mode='normal')
    name, kernel_file, extern = op_executor.compile_custom_npu(
        CUSTOMIZE_PATH, tiling_info.tiling_key, compile_npu_options)
    npu_compile_info = _npu_compile_info(extern, dump_mode='normal')
    run_npu_options = ascendebug.RunNpuOptions()
    op_executor.run_npu(kernel_file, run_npu_options,
                        npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner("=========================npu printf=================================================")

    # Step 7: camodel simulation, using a kernel compiled with simulator=True.
    compile_npu_options = ascendebug.CompileNpuOptions(syncall=True, simulator=True)
    _, kernel_file_simulator, _ = op_executor.compile_custom_npu(
        CUSTOMIZE_PATH, tiling_info.tiling_key, compile_npu_options)
    run_simulator_options = ascendebug.RunSimuOptions(block_num=1, timeout=1200)
    op_executor.run_camodel(kernel_file_simulator, run_simulator_options,
                            npu_compile_info=npu_compile_info, tiling_info=tiling_info)

    # Step 8: ESL simulation (x86 only).
    # NOTE(review): built_in() passes the simulator-compiled kernel to run_esl,
    # this passes the dump-mode kernel_file - confirm which one is intended.
    op_executor.run_esl(kernel_file, ESL_PATH, run_simulator_options,
                        npu_compile_info=npu_compile_info, tiling_info=tiling_info)

    # Step 9: profiling.
    profiling_options = ascendebug.RunProfilingOptions(block_num=24, loop=10)
    op_executor.run_profiling(kernel_file, profiling_options,
                              npu_compile_info=npu_compile_info, tiling_info=tiling_info)
    _print_banner("profiling_option----")


def main():
    """Run all four sample scenarios in sequence."""
    # Built-in operator project.
    built_in()
    # Direct kernel invocation project.
    x1_kernel_run()
    # Standard custom-operator project.
    msopgen()
    # Inner-source (op contrib) framework project.
    op_contrib_run()


if __name__ == "__main__":
    main()